Welcome to mirror list, hosted at ThFree Co, Russian Federation.

snt2cooc-reduce-mem-preprocess.cpp « src « mgizapp - github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 8f057967da5842de2ff87ec8f26f5e0bba4f735b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <iostream>
#include <string>
#include <strstream>
#include <fstream>
#include <map>
#include <vector>
#include <set>
#include <cstdio>
#include <cstdlib>

using namespace std;

void readVoc(istream&in,map<string,string>&voc)
{
    string line,s1,s2;
    voc["1"]="UNK";
    if( !in )cerr <<"Vocabulary does not exist.\n";
    while(getline(in,line))
    {
        istrstream eingabe(line.c_str());
        if( !(eingabe>>s1>>s2))
            cerr << "ERROR in vocabulary '" << line << "'\n";
        voc[s1]=s2;
    }
}

int maxElems=0;
int main(int argc,char **argv)
{
    if( argc!=4&&argc!=5 )
    {
        cerr << "Usage: " << argv[0] << " output vcb1 vcb2 snt12 \n";
        cerr << "Converts GIZA++ snt-format into plain text.\n";
        exit(1);
    }
    bool counts=0;
    if( argc==6 )
    {
        if(string(argv[4])!="-counts")
            cerr << "ERROR: wrong option " << argv[5] << endl;
        counts=1;
        maxElems=10000000;
    }
    ifstream v1(argv[1]),v2(argv[2]),t(argv[3]);
    map<string,string>voc1,voc2;
    readVoc(v1,voc1);
    readVoc(v2,voc2);
    string line1,line2,line3;
    int nLine=0;
    int totalElems=0;
    while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
    {
        istrstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str());
        double count;
        string word;
        eingabe1>>count;
        vector<int>l1,l2;
        while(eingabe2>>word)
            l1.push_back(atoi(word.c_str()));
        while(eingabe3>>word)
            l2.push_back(atoi(word.c_str()));
        if( ((++nLine)%1000)==0 )
            cerr << "line " << nLine << '\n';
        for(unsigned int j=0; j<l2.size(); ++j)
        {
        		cout << 0 << " " << l2[j] << endl;
        }
        for(unsigned int i=0; i<l1.size(); ++i)
        {
            for(unsigned int j=0; j<l2.size(); ++j)
            {
            		cout << l1[i] << " " << l2[j] << endl;
            }
        }
    }
    cerr << "END.\n";
}