Welcome to mirror list, hosted at ThFree Co, Russian Federation.

shared.cpp « eppex « contrib - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 670df1c0f90171cebcd557bfa726975cdab41c9e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
/**
 * Implementation of functionality shared between counter, eppex and
 * (not yet finished) memscoring eppex.
 *
 * (C) Moses: http://www.statmt.org/moses/
 * (C) Ceslav Przywara, UFAL MFF UK, 2011
 *
 * $Id$
 */

#include <string.h>
#include <boost/tokenizer.hpp>
#include <iostream>

#include "typedefs.h"
#include "phrase-extract.h"
#include "shared.h"


/**
 * Builds the help text that explains the format of Lossy Counter (LC)
 * parameters accepted on the command line.
 *
 * @return Multi-line description; every line (including the leading and the
 *         trailing empty one) is terminated by a newline character.
 */
std::string get_lossy_counting_params_format(void) {
    // One entry per output line, newline terminators are added below.
    static const char* const helpLines[] = {
        "",
        "You may specify separate Lossy Counter (LC) for each phrase length or",
        "use shared LC for all phrase pairs with length from given inclusive interval.",
        "Every LC is defined by parameter in form phrase-length:error:support, where:",
        "  phrase-length ... a single number (eg. 2) or interval (eg. 2-4)",
        "  error         ... error parameter for lossy counting",
        "  support       ... support parameter for lossy counting",
        "",
        "Example of LC params: 1:0:0 2-4:1e-7:4e-7 5-7:2e-8:8e-8",
        "   - phrase pairs of length 1 will NOT be pruned",
        "   - phrase pairs of length from 2 to 4 (inclusive) will be pruned altogether by LC",
        "     with parameters support=4e-7 and error=1e-7",
        "   - phrase pairs of length from 5 to 7 (inclusive) will be pruned altogether by LC",
        "     with parameters support=8e-8 and error=2e-8",
        "   - max phrase length extracted will be set to 7",
        "",
        "Note: there has to be Lossy Counter defined for every phrase pair length",
        "up to the maximum phrase length! Following will not work: 1:0:0 5-7:2e-8:8e-8",
        "",
        "To count phrase pairs by their length a separate program (counter) may be used.",
        ""
    };
    const size_t lineCount = sizeof(helpLines) / sizeof(helpLines[0]);

    std::string text;
    for ( size_t n = 0; n < lineCount; ++n ) {
        text += helpLines[n];
        text += '\n';
    }
    return text;
}

bool parse_lossy_counting_params(const std::string& param) {

    // See: http://www.boost.org/doc/libs/1_42_0/libs/tokenizer/char_separator.htm
    boost::char_separator<char> separators(",:");
    boost::tokenizer<boost::char_separator<char> > tokens(param, separators);
    boost::tokenizer<boost::char_separator<char> >::iterator iter = tokens.begin();

    std::string interval = *iter;

    if ( ++iter == tokens.end() ) {
        std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid format, missing error and support parameters specification!" << std::endl;
        return false;
    }
    PhrasePairsLossyCounter::error_t error = atof((*iter).c_str());

    if ( ++iter == tokens.end() ) {
        std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid format, missing support parameter specification!" << std::endl;
        return false;
    }
    PhrasePairsLossyCounter::support_t support = atof((*iter).c_str());

    if ( (error > 0) && !(error < support) ) {
        std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": support parameter (" << support << ") is not greater than error (" << error << ")!" << std::endl;
        return false;
    }

    // Split interval.
    boost::char_separator<char> separator("-");
    boost::tokenizer<boost::char_separator<char> > intervalTokens(interval, separator);
    iter = intervalTokens.begin();

    int from = 0, to = 0;

    from = atoi((*iter).c_str());
    if ( ++iter == intervalTokens.end() )
        to = from;
    else
        to = atoi((*iter).c_str());

    if ( ! (from <= to) ) {
        std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid interval " << from << "-" << to << " specified!" << std::endl;
        return false;
    }

    LossyCounterInstance* lci = new LossyCounterInstance(error, support);

    if ( lossyCounters.size() <= to ) {
        lossyCounters.resize(to + 1, NULL);
    }

    for ( size_t i = from; i <= to; ++i ) {
        if ( lossyCounters[i] != NULL ) {
            std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": Lossy Counter for phrases of length " << i << " is already defined!" << std::endl;
            return false;
        }
        lossyCounters[i] = lci;
    }

    // Set maximum phrase length accordingly:
    if ( maxPhraseLength < to )
        maxPhraseLength = to;

    return true;
}

/**
 * Translates the name of a reordering model type into the matching REO_*
 * constant.
 *
 * Written as a template so that it works with whatever type the global
 * wordType/phraseType/hierType variables are declared with.
 *
 * @param modelType  Type name: "msd", "mslr", "mono" or "monotonicity".
 * @param type       Receives the constant; left untouched for unknown names.
 * @return true when modelType was recognized, false otherwise.
 */
template <typename ReoType>
static bool parse_reordering_type(const char* modelType, ReoType& type) {
    if (strcmp(modelType, "msd") == 0) {
        type = REO_MSD;
    } else if (strcmp(modelType, "mslr") == 0) {
        type = REO_MSLR;
    } else if (strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) {
        type = REO_MONO;
    } else {
        return false;
    }
    return true;
}

/**
 * Processes optional command line parameters and sets the corresponding
 * global extraction options.
 *
 * Recognized options:
 *   orientation, --Orientation  sets orientationFlag
 *   --NoTTable                  clears translationFlag
 *   --model <model>-<type>      <model> is wbe, phrase or hier, <type> is msd,
 *                               mslr, mono or monotonicity; sets the matching
 *                               *Model flag and *Type value plus
 *                               allModelsOutputFlag
 *
 * Any syntax error is reported on std::cerr and terminates the program with
 * exit code 1; the unsupported option --OnlyOutputSpanInfo terminates it with
 * exit code 2.
 *
 * @param argc                 Number of items in argv.
 * @param argv                 Command line arguments; the argument of --model
 *                             is split in place by strtok().
 * @param optionalParamsStart  Index of the first optional parameter in argv.
 */
void read_optional_params(int argc, char* argv[], int optionalParamsStart) {

    for ( int i = optionalParamsStart; i < argc; i++ ) {
        if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
            std::cerr << "Error: option --OnlyOutputSpanInfo is not supported!\n";
            exit(2);
        } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
            orientationFlag = true;
        } else if (strcmp(argv[i],"--NoTTable") == 0) {
            translationFlag = false;
        } else if(strcmp(argv[i],"--model") == 0) {
            if (i+1 >= argc) {
                std::cerr << "extract: syntax error, no model's information provided to the option --model " << std::endl;
                exit(1);
            }
            char* modelParams = argv[++i];
            const char* modelName = strtok(modelParams, "-");
            const char* modelType = strtok(NULL, "-");

            // strtok() returns NULL when a part is missing (eg. "--model wbe"
            // or an empty argument); passing NULL to strcmp() is undefined.
            if (modelName == NULL || modelType == NULL) {
                std::cerr << "extract: syntax error, option --model expects an argument in form model-type (eg. wbe-msd)" << std::endl;
                exit(1);
            }

            bool knownType = false;
            if(strcmp(modelName, "wbe") == 0) {
                wordModel = true;
                knownType = parse_reordering_type(modelType, wordType);
            } else if(strcmp(modelName, "phrase") == 0) {
                phraseModel = true;
                knownType = parse_reordering_type(modelType, phraseType);
            } else if(strcmp(modelName, "hier") == 0) {
                hierModel = true;
                knownType = parse_reordering_type(modelType, hierType);
            } else {
                std::cerr << "extract: syntax error, unknown reordering model: " << modelName << std::endl;
                exit(1);
            }

            if (!knownType) {
                std::cerr << "extract: syntax error, unknown reordering model type: " << modelType << std::endl;
                exit(1);
            }

            allModelsOutputFlag = true;

        } else {
            std::cerr << "extract: syntax error, unknown option '" << std::string(argv[i]) << "'\n";
            exit(1);
        }
    }

    // default reordering model if no model selected
    // allows for the old syntax to be used
    if(orientationFlag && !allModelsOutputFlag) {
        wordModel = true;
        wordType = REO_MSD;
    }

} // end of read_optional_params()