Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'mgizapp/src/snt2plain.cpp')
-rw-r--r-- mgizapp/src/snt2plain.cpp 93
1 files changed, 93 insertions, 0 deletions
diff --git a/mgizapp/src/snt2plain.cpp b/mgizapp/src/snt2plain.cpp
new file mode 100644
index 0000000..23dacbe
--- /dev/null
+++ b/mgizapp/src/snt2plain.cpp
@@ -0,0 +1,93 @@
#include <stdio.h>
#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <strstream>
#include <vector>
+
+
+using namespace std;
+
/// Reads a vocabulary from `in` into `voc`.
///
/// Each line must hold at least two whitespace-separated fields; the first
/// becomes the key and the second the mapped word. Any further fields on the
/// line are ignored. The key "1" is pre-seeded with "UNK" and is overwritten
/// if the stream contains its own entry for "1".
///
/// @param in   Stream to read from. If it is already in a failed state a
///             warning is written to stderr and only the "UNK" entry is set.
/// @param voc  Map that receives the entries; existing keys are overwritten.
///
/// Lines with fewer than two fields are reported on stderr and skipped, so a
/// broken line can never be stored with fields left over from an earlier line.
void readVoc(std::istream&in,std::map<std::string,std::string>&voc)
{
  voc["1"]="UNK";
  if( !in )
    std::cerr <<"Vocabulary does not exist.\n";
  std::string line;
  while(std::getline(in,line))
  {
    std::istringstream eingabe(line);
    // Declared per line so nothing carries over from the previous entry.
    std::string id,word;
    if( !(eingabe>>id>>word) )
    {
      std::cerr << "ERROR in vocabulary '" << line << "'\n";
      continue;
    }
    voc[id]=word;
  }
}
+
+int main(int argc,char **argv)
+{
+ if( argc!=5&&argc!=6 )
+ {
+ cerr << "Usage: " << argv[0] << " vcb1 vcb2 snt12 output_prefix [ -counts ]\n";
+ cerr << "Converts GIZA++ snt-format into plain text.\n";
+ exit(1);
+ }
+ bool counts=0;
+ if( argc==6 )
+ {
+ if(string(argv[5])!="-counts")
+ cerr << "ERROR: wrong option " << argv[5] << endl;
+ counts=1;
+ }
+ ifstream v1(argv[1]),v2(argv[2]),t(argv[3]);
+ string prefix(argv[4]);
+ string outfil1=prefix+"1.txt";
+ string outfil2=prefix+"2.txt";
+ ofstream out1(outfil1.c_str());
+ ofstream out2(outfil2.c_str());
+ map<string,string>voc1,voc2;
+ readVoc(v1,voc1);
+ readVoc(v2,voc2);
+ int source=0,target=0;
+ string line1,line2,line3;
+ int printed=0;
+ while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
+ {
+ istrstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str());
+ double count;
+ string word;
+ eingabe1>>count;
+ vector<string>l1,l2;
+ while(eingabe2>>word)
+ l1.push_back(word);
+ while(eingabe3>>word)
+ l2.push_back(word);
+ if( counts )
+ cout << count << '\n';
+ for(unsigned int p=0;p<l1.size();p++)
+ {
+ if(voc1.count(l1[p])==0)
+ {
+ if( printed++==0)
+ cerr << "ERROR: source vocabulary entry " << l1[p] << " unknown.\n";
+ out1 << l1[p]<<' ';
+ }
+ else
+ out1 << voc1[l1[p]] << ' ';
+ source++;
+ }
+ for(unsigned int p=0;p<l2.size();p++)
+ {
+ if(voc2.count(l2[p])==0)
+ {
+ if( printed++ ==0)
+ cerr << "ERROR: target vocabulary entry " << l2[p] << " unknown.\n";
+ out2 <<l2[p]<<' ';
+ }
+ out2 << voc2[l2[p]] << ' ';
+ target++;
+ }
+ out1<<'\n';
+ out2<<'\n';
+ }
+}