Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'experimental/dual-model/MGIZA/src/plain2snt.cpp')
-rw-r--r--experimental/dual-model/MGIZA/src/plain2snt.cpp117
1 files changed, 117 insertions, 0 deletions
diff --git a/experimental/dual-model/MGIZA/src/plain2snt.cpp b/experimental/dual-model/MGIZA/src/plain2snt.cpp
new file mode 100644
index 0000000..6e8f0b7
--- /dev/null
+++ b/experimental/dual-model/MGIZA/src/plain2snt.cpp
@@ -0,0 +1,117 @@
+#include <iostream>
+#include <string>
+#include <strstream>
+#include <fstream>
+#include <map>
+#include <vector>
+#include <stdlib.h>
+#include <stdio.h>
+
+using namespace std;
+
+int main(int argc,char**argv)
+{
+ vector<double>weights;
+ vector<string>filenames;
+ for(int i=1;i<argc;++i)
+ if(string(argv[i])=="-weight")
+ weights.push_back(atof(argv[++i]));
+ else
+ filenames.push_back(argv[i]);
+
+ if((filenames.size()%2)==1||filenames.size()==0 )
+ {
+ cerr << argv[0] << " txt1 txt2 [txt3 txt4 -weight w]\n";
+ cerr << " Converts plain text into GIZA++ snt-format.\n";
+ exit(1);
+ }
+ string line1,line2,word;
+ map<string,int> v1,v2;
+ map<string,int> id1,id2;
+ vector<string> iid1(2),iid2(2);
+
+ string w1(filenames[0]);
+ string w2(filenames[1]);
+
+ if( w1.length()>4&&w2.length()>4&&((w1.substr(w1.length()-4,w1.length())==".tok" && w2.substr(w2.length()-4,w2.length())==".tok" )||
+ (w1.substr(w1.length()-4,w1.length())==".txt" && w2.substr(w2.length()-4,w2.length())==".txt" ) ))
+ {
+ w1=w1.substr(0,w1.length()-4);
+ w2=w2.substr(0,w2.length()-4);
+ cerr << "w1:"<< w1 << " w2:" << w2 << endl;
+ }
+
+
+ string vocab1(w1),vocab2(w2),snt1,snt2;
+ unsigned int slashpos=vocab1.rfind('/')+1;
+ if( slashpos>=vocab1.length() ) slashpos=0;
+ string vocab1x(vocab1.substr(slashpos,vocab1.length()));
+ cout << vocab1 << " -> " << vocab1x << endl;
+ slashpos=vocab2.rfind('/')+1;
+ if( slashpos>=vocab2.length() ) slashpos=0;
+ string vocab2x(vocab2.substr(slashpos,vocab2.length()));
+ cout << vocab2 << " -> " << vocab2x << endl;
+ snt1=vocab1+"_"+vocab2x+string(".snt");
+ snt2=vocab2+"_"+vocab1x+string(".snt");
+ vocab1+=string(".vcb");
+ vocab2+=string(".vcb");
+
+ ofstream ovocab1(vocab1.c_str()),ovocab2(vocab2.c_str()),osnt1(snt1.c_str()),osnt2(snt2.c_str());
+ for(unsigned int i=0;i<filenames.size();i+=2)
+ {
+ ifstream i1(filenames[i].c_str()),i2(filenames[i+1].c_str());
+ if(!i1)cerr << "WARNING: " << filenames[i] << " cannot be read.\n";
+ if(!i2)cerr << "WARNING: " << filenames[i+1] << " cannot be read.\n";
+ while(getline(i1,line1) && getline(i2,line2) )
+ {
+ vector<string> t1,t2;
+ istrstream ii1(line1.c_str());
+ while(ii1>>word)
+ {
+ t1.push_back(word);
+ v1[word]++;
+ if( id1.find(word)==id1.end() )
+ {
+ iid1.push_back(word);
+ id1[word]=iid1.size()-1;
+ }
+ }
+ istrstream ii2(line2.c_str());
+ while(ii2>>word)
+ {
+ t2.push_back(word);
+ v2[word]++;
+ if( id2.find(word)==id2.end() )
+ {
+ iid2.push_back(word);
+ id2[word]=iid2.size()-1;
+ }
+ }
+ double w=1.0;
+ if( i/2<weights.size() )
+ w=weights[i/2];
+ if( t1.size()&&t2.size() )
+ {
+ osnt1 << w << "\n";
+ for(unsigned int j=0;j<t1.size();++j)osnt1 << id1[t1[j]] << ' ';
+ osnt1 << '\n';
+ for(unsigned int j=0;j<t2.size();++j)osnt1 << id2[t2[j]] << ' ';
+ osnt1 << '\n';
+
+ osnt2 << w << "\n";
+ for(unsigned int j=0;j<t2.size();++j)osnt2 << id2[t2[j]] << ' ';
+ osnt2 << '\n';
+ for(unsigned int j=0;j<t1.size();++j)osnt2 << id1[t1[j]] << ' ';
+ osnt2 << '\n';
+ }
+ else
+ cerr << "WARNING: filtered out empty sentence (source: " << filenames[i] << " " << t1.size() <<
+ " target: " << filenames[i+1] << " " << t2.size() << ").\n";
+ }
+ }
+
+ for(unsigned int i=2;i<iid1.size();++i)
+ ovocab1 << i << ' ' << iid1[i] << ' ' << v1[iid1[i]] << '\n';
+ for(unsigned int i=2;i<iid2.size();++i)
+ ovocab2 << i << ' ' << iid2[i] << ' ' << v2[iid2[i]] << '\n';
+}