diff options
Diffstat (limited to 'scripts/training/eppex/shared.cpp')
-rw-r--r-- | scripts/training/eppex/shared.cpp | 194 |
1 files changed, 0 insertions, 194 deletions
diff --git a/scripts/training/eppex/shared.cpp b/scripts/training/eppex/shared.cpp deleted file mode 100644 index 670df1c0f..000000000 --- a/scripts/training/eppex/shared.cpp +++ /dev/null @@ -1,194 +0,0 @@ -/** - * Implementation of functionality shared between counter, eppex and - * (not yet finished) memscoring eppex. - * - * (C) Moses: http://www.statmt.org/moses/ - * (C) Ceslav Przywara, UFAL MFF UK, 2011 - * - * $Id$ - */ - -#include <string.h> -#include <boost/tokenizer.hpp> -#include <iostream> - -#include "typedefs.h" -#include "phrase-extract.h" -#include "shared.h" - - -std::string get_lossy_counting_params_format(void) { - return "\n" - "You may specify separate Lossy Counter (LC) for each phrase length or\n" - "use shared LC for all phrase pairs with length from given inclusive interval.\n" - "Every LC is defined by parameter in form phrase-length:error:support, where:\n" - " phrase-length ... a single number (eg. 2) or interval (eg. 2-4)\n" - " error ... error parameter for lossy counting\n" - " support ... support parameter for lossy counting\n" - "\n" - "Example of LC params: 1:0:0 2-4:1e-7:4e-7 5-7:2e-8:8e-8\n" - " - phrase pairs of length 1 will NOT be pruned\n" - " - phrase pairs of length from 2 to 4 (inclusive) will be pruned altogether by LC\n" - " with parameters support=4e-7 and error=1e-7\n" - " - phrase pairs of length from 5 to 7 (inclusive) will be pruned altogether by LC\n" - " with parameters support=8e-8 and error=2e-8\n" - " - max phrase length extracted will be set to 7\n" - "\n" - "Note: there has to be Lossy Counter defined for every phrase pair length\n" - "up to the maximum phrase length! Following will not work: 1:0:0 5-7:2e-8:8e-8\n" - "\n" - "To count phrase pairs by their length a separate program (counter) may be used.\n" - "\n" - ; -} - -bool parse_lossy_counting_params(const std::string& param) { - - // See: http://www.boost.org/doc/libs/1_42_0/libs/tokenizer/char_separator.htm - boost::char_separator<char> separators(",:"); - boost::tokenizer<boost::char_separator<char> > tokens(param, separators); - boost::tokenizer<boost::char_separator<char> >::iterator iter = tokens.begin(); - - std::string interval = *iter; - - if ( ++iter == tokens.end() ) { - std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid format, missing error and support parameters specification!" << std::endl; - return false; - } - PhrasePairsLossyCounter::error_t error = atof((*iter).c_str()); - - if ( ++iter == tokens.end() ) { - std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid format, missing support parameter specification!" << std::endl; - return false; - } - PhrasePairsLossyCounter::support_t support = atof((*iter).c_str()); - - if ( (error > 0) && !(error < support) ) { - std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": support parameter (" << support << ") is not greater than error (" << error << ")!" << std::endl; - return false; - } - - // Split interval. - boost::char_separator<char> separator("-"); - boost::tokenizer<boost::char_separator<char> > intervalTokens(interval, separator); - iter = intervalTokens.begin(); - - int from = 0, to = 0; - - from = atoi((*iter).c_str()); - if ( ++iter == intervalTokens.end() ) - to = from; - else - to = atoi((*iter).c_str()); - - if ( ! (from <= to) ) { - std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid interval " << from << "-" << to << " specified!" << std::endl; - return false; - } - - LossyCounterInstance* lci = new LossyCounterInstance(error, support); - - if ( lossyCounters.size() <= to ) { - lossyCounters.resize(to + 1, NULL); - } - - for ( size_t i = from; i <= to; ++i ) { - if ( lossyCounters[i] != NULL ) { - std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": Lossy Counter for phrases of length " << i << " is already defined!" << std::endl; - return false; - } - lossyCounters[i] = lci; - } - - // Set maximum phrase length accordingly: - if ( maxPhraseLength < to ) - maxPhraseLength = to; - - return true; -} - -void read_optional_params(int argc, char* argv[], int optionalParamsStart) { - - for ( int i = optionalParamsStart; i < argc; i++ ) { - if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) { - std::cerr << "Error: option --OnlyOutputSpanInfo is not supported!\n"; - exit(2); - } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { - orientationFlag = true; - } else if (strcmp(argv[i],"--NoTTable") == 0) { - translationFlag = false; - } else if(strcmp(argv[i],"--model") == 0) { - if (i+1 >= argc) { - std::cerr << "extract: syntax error, no model's information provided to the option --model " << std::endl; - exit(1); - } - char* modelParams = argv[++i]; - const char* modelName = strtok(modelParams, "-"); - const char* modelType = strtok(NULL, "-"); - - if(strcmp(modelName, "wbe") == 0) { - wordModel = true; - if(strcmp(modelType, "msd") == 0) { - wordType = REO_MSD; - } - else if(strcmp(modelType, "mslr") == 0) { - wordType = REO_MSLR; - } - else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) { - wordType = REO_MONO; - } - else { - std::cerr << "extract: syntax error, unknown reordering model type: " << modelType << std::endl; - exit(1); - } - } else if(strcmp(modelName, "phrase") == 0) { - phraseModel = true; - if(strcmp(modelType, "msd") == 0) { - phraseType = REO_MSD; - } - else if(strcmp(modelType, "mslr") == 0) { - phraseType = REO_MSLR; - } - else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) { - phraseType = REO_MONO; - } - else { - std::cerr << "extract: syntax error, unknown reordering model type: " << modelType << std::endl; - exit(1); - } - } else if(strcmp(modelName, "hier") == 0) { - hierModel = true; - if(strcmp(modelType, "msd") == 0) { - hierType = REO_MSD; - } - else if(strcmp(modelType, "mslr") == 0) { - hierType = REO_MSLR; - } - else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) { - hierType = REO_MONO; - } - else { - std::cerr << "extract: syntax error, unknown reordering model type: " << modelType << std::endl; - exit(1); - } - } else { - std::cerr << "extract: syntax error, unknown reordering model: " << modelName << std::endl; - exit(1); - } - - allModelsOutputFlag = true; - - } else { - std::cerr << "extract: syntax error, unknown option '" << std::string(argv[i]) << "'\n"; - exit(1); - } - } - - // default reordering model if no model selected - // allows for the old syntax to be used - if(orientationFlag && !allModelsOutputFlag) { - wordModel = true; - wordType = REO_MSD; - } - -} // end of read_optional_params() |