Welcome to mirror list, hosted at ThFree Co, Russian Federation.

snt2cooc.cpp « src « MGIZA « alignment-enabled « experimental - github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 48328dc0d50eff939989f76523f390f390c0817d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#include <iostream>
#include <string>
#include <strstream>
#include <fstream>
#include <map>
#include <vector>
#include <set>
#include <cstdio>
#include <cstdlib>

using namespace std;

void readVoc(istream&in,map<string,string>&voc)
{
  string line,s1,s2; 
  voc["1"]="UNK";
  if( !in )cerr <<"Vocabulary does not exist.\n";
  while(getline(in,line))
    {
      istrstream eingabe(line.c_str()); 
      if( !(eingabe>>s1>>s2))
	cerr << "ERROR in vocabulary '" << line << "'\n";
      voc[s1]=s2;
    }
}

int maxElems=0;
int main(int argc,char **argv)
{
  if( argc!=5&&argc!=6 )
    {
      cerr << "Usage: " << argv[0] << " output vcb1 vcb2 snt12 \n";
      cerr << "Converts GIZA++ snt-format into plain text.\n";
      exit(1);
    }
  bool counts=0;
  if( argc==6 )
    {
      if(string(argv[5])!="-counts")
	cerr << "ERROR: wrong option " << argv[6] << endl;
      counts=1;
	maxElems=10000000;
    }
  ifstream v1(argv[2]),v2(argv[3]),t(argv[4]);
  ofstream ou(argv[1]);
  map<string,string>voc1,voc2;
  readVoc(v1,voc1);
  readVoc(v2,voc2);
  string line1,line2,line3;
  vector<map<int,int> > vsi(voc1.size()+1000);
  int nLine=0;
  int totalElems=0;
  while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
    {
      istrstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str());
      double count;
      string word;
      eingabe1>>count;
      vector<int>l1,l2;
      while(eingabe2>>word)
	l1.push_back(atoi(word.c_str()));
      while(eingabe3>>word)
	l2.push_back(atoi(word.c_str()));
      if( ((++nLine)%1000)==0 )
	cerr << "line " << nLine << '\n'; 
      totalElems-=vsi[0].size();
      for(unsigned int j=0;j<l2.size();++j)
	vsi[0][l2[j]]++;
      totalElems+=vsi[0].size();
      for(unsigned int i=0;i<l1.size();++i)
	{
	  if( l1[i]>=int(vsi.size()) )
	    {
	      cerr << "I have to resize: " << l1[i] << endl;
	      vsi.resize(l1[i]+1);
	    }
	  map<int,int>&theset=vsi[l1[i]];
	  totalElems-=theset.size();
	  for(unsigned int j=0;j<l2.size();++j)
	    theset[l2[j]]++;
	  totalElems+=theset.size();
	}
      if( totalElems>maxElems&&maxElems )
	{
	  cerr << "INFO: print out " << totalElems << " entries.\n";
	  for(unsigned int i=0;i<vsi.size();++i)
	    for(map<int,int>::const_iterator j=vsi[i].begin();j!=vsi[i].end();++j)
	      {
		if(counts==1 )
		  ou << j->second << " " << i << " " << j->first << '\n';
		else
		  ou << i << " " << j->first << '\n';
	      }
	  totalElems=0;
	  vsi.clear();
	  vsi.resize(voc1.size()+1000);
	}
    }
  cerr << "END.\n";
  for(unsigned int i=0;i<vsi.size();++i)
    for(map<int,int>::const_iterator j=vsi[i].begin();j!=vsi[i].end();++j)
      {
	if(counts==1 )
	  ou << j->second << " " << i << " " << j->first << '\n';
	else
	  ou << i << " " << j->first << '\n';
      }
   ou.flush();
   ou.close();
}