1 files changed, 117 insertions, 0 deletions
diff --git a/experimental/dual-model/MGIZA/src/plain2snt.cpp b/experimental/dual-model/MGIZA/src/plain2snt.cpp
new file mode 100644
index 0000000..6e8f0b7
--- /dev/null
+++ b/experimental/dual-model/MGIZA/src/plain2snt.cpp
@@ -0,0 +1,117 @@
+#include <iostream>
+#include <string>
+#include <strstream>
+#include <fstream>
+#include <map>
+#include <vector>
+#include <stdlib.h>
+#include <stdio.h>
+
+using namespace std;
+
+int main(int argc,char**argv)
+{
+  // Converts pairs of parallel plain-text files into GIZA++ input: a .vcb file
+  // per side and two .snt files. The k-th "-weight w" is the weight written
+  // for the k-th file pair; pairs without one get 1.0.
+  vector<double>weights;
+  vector<string>filenames;
+  bool badargs=false;
+  for(int i=1;i<argc;++i)
+    if(string(argv[i])!="-weight")
+      filenames.push_back(argv[i]);
+    else if(i+1<argc)
+      weights.push_back(atof(argv[++i]));
+    else
+      badargs=true; // trailing "-weight": argv[argc] is null, must not reach atof
+  if( badargs||(filenames.size()%2)==1||filenames.size()==0 )
+    {
+      cerr << argv[0] << " txt1 txt2 [txt3 txt4 -weight w]\n";
+      cerr << " Converts plain text into GIZA++ snt-format.\n";
+      exit(1);
+    }
+  string line1,line2,word;
+  map<string,int> v1,v2;          // word -> occurrence count, per side
+  map<string,int> id1,id2;        // word -> numeric id, per side
+  vector<string> iid1(2),iid2(2); // id -> word; two placeholders make ids start at 2
+  // Output names come from the first pair, minus a shared ".tok"/".txt" suffix.
+  string w1(filenames[0]);
+  string w2(filenames[1]);
+  const string ext1(w1.length()>4?w1.substr(w1.length()-4):string());
+  const string ext2(w2.length()>4?w2.substr(w2.length()-4):string());
+  if( ext1==ext2&&(ext1==".tok"||ext1==".txt") )
+    {
+      w1.erase(w1.length()-4);
+      w2.erase(w2.length()-4);
+      cerr << "w1:"<< w1 << " w2:" << w2 << endl;
+    }
+  // vocab1x/vocab2x are the names without their directory part.
+  string vocab1(w1),vocab2(w2),snt1,snt2;
+  string::size_type slashpos=vocab1.rfind('/')+1; // npos+1 wraps around to 0
+  if( slashpos>=vocab1.length() ) slashpos=0;
+  string vocab1x(vocab1.substr(slashpos,vocab1.length()));
+  cout << vocab1 << " -> " << vocab1x << endl;
+  slashpos=vocab2.rfind('/')+1;
+  if( slashpos>=vocab2.length() ) slashpos=0;
+  string vocab2x(vocab2.substr(slashpos,vocab2.length()));
+  cout << vocab2 << " -> " << vocab2x << endl;
+  snt1=vocab1+"_"+vocab2x+string(".snt");
+  snt2=vocab2+"_"+vocab1x+string(".snt");
+  vocab1+=string(".vcb");
+  vocab2+=string(".vcb");
+  ofstream ovocab1(vocab1.c_str()),ovocab2(vocab2.c_str()),osnt1(snt1.c_str()),osnt2(snt2.c_str());
+  for(unsigned int i=0;i<filenames.size();i+=2)
+    {
+      ifstream i1(filenames[i].c_str()),i2(filenames[i+1].c_str());
+      if(!i1)cerr << "WARNING: " << filenames[i] << " cannot be read.\n";
+      if(!i2)cerr << "WARNING: " << filenames[i+1] << " cannot be read.\n";
+      const double w=( i/2<weights.size() )?weights[i/2]:1.0; // fixed per pair
+      // Lines are read in lockstep; reading stops when either file runs out.
+      while(getline(i1,line1) && getline(i2,line2) )
+        {
+          vector<string> t1,t2;
+          istrstream ii1(line1.c_str());
+          while(ii1>>word)
+            {
+              t1.push_back(word);
+              v1[word]++;
+              if( id1.find(word)==id1.end() )
+                {
+                  iid1.push_back(word);
+                  id1[word]=iid1.size()-1;
+                }
+            }
+          istrstream ii2(line2.c_str());
+          while(ii2>>word)
+            {
+              t2.push_back(word);
+              v2[word]++;
+              if( id2.find(word)==id2.end() )
+                {
+                  iid2.push_back(word);
+                  id2[word]=iid2.size()-1;
+                }
+            }
+          if( t1.size()&&t2.size() )
+            {
+              osnt1 << w << "\n"; // record: weight line, then one id line per side
+              for(unsigned int j=0;j<t1.size();++j)osnt1 << id1[t1[j]] << ' ';
+              osnt1 << '\n';
+              for(unsigned int j=0;j<t2.size();++j)osnt1 << id2[t2[j]] << ' ';
+              osnt1 << '\n';
+              osnt2 << w << "\n"; // same record with the two sides swapped
+              for(unsigned int j=0;j<t2.size();++j)osnt2 << id2[t2[j]] << ' ';
+              osnt2 << '\n';
+              for(unsigned int j=0;j<t1.size();++j)osnt2 << id1[t1[j]] << ' ';
+              osnt2 << '\n';
+            }
+          else
+            cerr << "WARNING: filtered out empty sentence (source: " << filenames[i] << " " << t1.size() <<
+              " target: " << filenames[i+1] << " " << t2.size() << ").\n";
+        }
+    }
+  for(unsigned int i=2;i<iid1.size();++i) // .vcb line: id, word, occurrence count
+    ovocab1 << i << ' ' << iid1[i] << ' ' << v1[iid1[i]] << '\n';
+  for(unsigned int i=2;i<iid2.size();++i)
+    ovocab2 << i << ' ' << iid2[i] << ' ' << v2[iid2[i]] << '\n';
+}