diff options
Diffstat (limited to 'mgizapp/src/snt2cooc.cpp')
-rw-r--r-- | mgizapp/src/snt2cooc.cpp | 111 |
1 files changed, 111 insertions, 0 deletions
diff --git a/mgizapp/src/snt2cooc.cpp b/mgizapp/src/snt2cooc.cpp new file mode 100644 index 0000000..0eb2aaf --- /dev/null +++ b/mgizapp/src/snt2cooc.cpp @@ -0,0 +1,111 @@ +#include <iostream> +#include <string> +#include <strstream> +#include <fstream> +#include <map> +#include <vector> +#include <set> +#include <stdio.h> +#include <stdlib.h> + +using namespace std; + +void readVoc(istream&in,map<string,string>&voc) +{ + string line,s1,s2; + voc["1"]="UNK"; + if( !in )cerr <<"Vocabulary does not exist.\n"; + while(getline(in,line)) + { + istrstream eingabe(line.c_str()); + if( !(eingabe>>s1>>s2)) + cerr << "ERROR in vocabulary '" << line << "'\n"; + voc[s1]=s2; + } +} + +int maxElems=0; +int main(int argc,char **argv) +{ + if( argc!=5&&argc!=6 ) + { + cerr << "Usage: " << argv[0] << " output vcb1 vcb2 snt12 \n"; + cerr << "Converts GIZA++ snt-format into plain text.\n"; + exit(1); + } + bool counts=0; + if( argc==6 ) + { + if(string(argv[5])!="-counts") + cerr << "ERROR: wrong option " << argv[6] << endl; + counts=1; + maxElems=10000000; + } + ifstream v1(argv[2]),v2(argv[3]),t(argv[4]); + ofstream ou(argv[1]); + map<string,string>voc1,voc2; + readVoc(v1,voc1); + readVoc(v2,voc2); + string line1,line2,line3; + vector<map<int,int> > vsi(voc1.size()+1000); + int nLine=0; + int totalElems=0; + while(getline(t,line1)&&getline(t,line2)&&getline(t,line3)) + { + istrstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str()); + double count; + string word; + eingabe1>>count; + vector<int>l1,l2; + while(eingabe2>>word) + l1.push_back(atoi(word.c_str())); + while(eingabe3>>word) + l2.push_back(atoi(word.c_str())); + if( ((++nLine)%1000)==0 ) + cerr << "line " << nLine << '\n'; + totalElems-=vsi[0].size(); + for(unsigned int j=0;j<l2.size();++j) + vsi[0][l2[j]]++; + totalElems+=vsi[0].size(); + for(unsigned int i=0;i<l1.size();++i) + { + if( l1[i]>=int(vsi.size()) ) + { + cerr << "I have to resize: " << l1[i] << endl; + vsi.resize(l1[i]+1); + } + map<int,int>&theset=vsi[l1[i]]; + totalElems-=theset.size(); + for(unsigned int j=0;j<l2.size();++j) + theset[l2[j]]++; + totalElems+=theset.size(); + } + if( totalElems>maxElems&&maxElems ) + { + cerr << "INFO: print out " << totalElems << " entries.\n"; + for(unsigned int i=0;i<vsi.size();++i) + for(map<int,int>::const_iterator j=vsi[i].begin();j!=vsi[i].end();++j) + { + if(counts==1 ) + ou << j->second << " " << i << " " << j->first << '\n'; + else + ou << i << " " << j->first << '\n'; + } + totalElems=0; + vsi.clear(); + vsi.resize(voc1.size()+1000); + } + } + cerr << "END.\n"; + for(unsigned int i=0;i<vsi.size();++i) + for(map<int,int>::const_iterator j=vsi[i].begin();j!=vsi[i].end();++j) + { + if(counts==1 ) + ou << j->second << " " << i << " " << j->first << '\n'; + else + ou << i << " " << j->first << '\n'; + } + ou.flush(); + ou.close(); +} + |