diff options
author | Hieu Hoang <hieu@hoang.co.uk> | 2014-08-29 19:24:49 +0400 |
---|---|---|
committer | Hieu Hoang <hieu@hoang.co.uk> | 2014-08-29 19:24:49 +0400 |
commit | e6438e378fd6ca18abac3ec4d66508beba189a62 (patch) | |
tree | 932e390300987c7dd296e9a9678eed9c97e40547 /phrase-extract/extract-mixed-syntax/Main.cpp | |
parent | 379da960d1f496a5b1636e0ee80a0e90abe87b89 (diff) |
Add option to sort chart translation option after EvaluateWithSourceContext
Diffstat (limited to 'phrase-extract/extract-mixed-syntax/Main.cpp')
-rw-r--r-- | phrase-extract/extract-mixed-syntax/Main.cpp | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/phrase-extract/extract-mixed-syntax/Main.cpp b/phrase-extract/extract-mixed-syntax/Main.cpp new file mode 100644 index 000000000..2fcb06a9b --- /dev/null +++ b/phrase-extract/extract-mixed-syntax/Main.cpp @@ -0,0 +1,203 @@ +#include <iostream> +#include <cstdlib> +#include <boost/program_options.hpp> + +#include "Main.h" +#include "InputFileStream.h" +#include "OutputFileStream.h" +#include "AlignedSentence.h" +#include "AlignedSentenceSyntax.h" +#include "Parameter.h" +#include "Rules.h" + +using namespace std; + +bool g_debug = false; + +int main(int argc, char** argv) +{ + cerr << "Starting" << endl; + + Parameter params; + + namespace po = boost::program_options; + po::options_description desc("Options"); + desc.add_options() + ("help", "Print help messages") + ("MaxSpan", po::value<int>()->default_value(params.maxSpan), "Max (source) span of a rule. ie. number of words in the source") + ("MinSpan", po::value<int>()->default_value(params.minSpan), "Min (source) span of a rule.") + ("GlueGrammar", po::value<string>()->default_value(params.gluePath), "Output glue grammar to here") + ("SentenceOffset", po::value<long>()->default_value(params.sentenceOffset), "Starting sentence id. Not used") + ("GZOutput", "Compress extract files") + ("MaxNonTerm", po::value<int>()->default_value(params.maxNonTerm), "Maximum number of non-terms allowed per rule") + ("MaxHieroNonTerm", po::value<int>()->default_value(params.maxHieroNonTerm), "Maximum number of Hiero non-term. Usually, --MaxNonTerm is the normal constraint") + ("MinHoleSource", po::value<int>()->default_value(params.minHoleSource), "Minimum source span for a non-term.") + + ("SourceSyntax", "Source sentence is a parse tree") + ("TargetSyntax", "Target sentence is a parse tree") + ("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere") + ("MultiLabel", po::value<int>()->default_value(params.multiLabel), "What to do with multiple labels on the same span. 0(default)=keep them all, 1=keep only top-most, 2=keep only bottom-most") + ("HieroSourceLHS", "Always use Hiero source LHS? Default = 0") + ("MaxSpanFreeNonTermSource", po::value<int>()->default_value(params.maxSpanFreeNonTermSource), "Max number of words covered by beginning/end NT. Default = 0 (no limit)") + ("NoNieceTerminal", "Don't extract rule if 1 of the non-term covers the same word as 1 of the terminals") + ("MaxScope", po::value<int>()->default_value(params.maxScope), "maximum scope (see Hopkins and Langmead (2010)). Default is HIGH") + ("MinScope", po::value<int>()->default_value(params.minScope), "min scope.") + + ("SpanLength", "Property - span length of RHS each non-term") + + ("NonTermContext", "Property - (source) left and right, inside and outside words of each non-term ") + ("NonTermContextTarget", "Property - (target) left and right, inside and outside words of each non-term") + ("NonTermContextFactor", po::value<int>()->default_value(params.nonTermContextFactor), "Factor to use for non-term context property.") + + ("NumSourceFactors", po::value<int>()->default_value(params.numSourceFactors), "Number of source factors.") + ("NumTargetFactors", po::value<int>()->default_value(params.numTargetFactors), "Number of target factors.") + + ("HieroNonTerm", po::value<string>()->default_value(params.hieroNonTerm), "Hiero non-terminal label, including bracket") + ("ScopeSpan", po::value<string>()->default_value(params.scopeSpanStr), "Min and max span for rules of each scope. Format is min,max:min,max...") + + ("NonTermConsecSource", "Allow consecutive non-terms on the source side"); + + + po::variables_map vm; + try + { + po::store(po::parse_command_line(argc, argv, desc), + vm); // can throw + + /** --help option + */ + if ( vm.count("help") || argc < 5 ) + { + std::cout << argv[0] << " target source alignment [options...]" << std::endl + << desc << std::endl; + return EXIT_SUCCESS; + } + + po::notify(vm); // throws on error, so do after help in case + // there are any problems + } + catch(po::error& e) + { + std::cerr << "ERROR: " << e.what() << std::endl << std::endl; + std::cerr << desc << std::endl; + return EXIT_FAILURE; + } + + if (vm.count("MaxSpan")) params.maxSpan = vm["MaxSpan"].as<int>(); + if (vm.count("MinSpan")) params.minSpan = vm["MinSpan"].as<int>(); + if (vm.count("GZOutput")) params.gzOutput = true; + if (vm.count("GlueGrammar")) params.gluePath = vm["GlueGrammar"].as<string>(); + if (vm.count("SentenceOffset")) params.sentenceOffset = vm["SentenceOffset"].as<long>(); + if (vm.count("MaxNonTerm")) params.maxNonTerm = vm["MaxNonTerm"].as<int>(); + if (vm.count("MaxHieroNonTerm")) params.maxHieroNonTerm = vm["MaxHieroNonTerm"].as<int>(); + if (vm.count("MinHoleSource")) params.minHoleSource = vm["MinHoleSource"].as<int>(); + + if (vm.count("SourceSyntax")) params.sourceSyntax = true; + if (vm.count("TargetSyntax")) params.targetSyntax = true; + if (vm.count("MixedSyntaxType")) params.mixedSyntaxType = vm["MixedSyntaxType"].as<int>(); + if (vm.count("MultiLabel")) params.multiLabel = vm["MultiLabel"].as<int>(); + if (vm.count("HieroSourceLHS")) params.hieroSourceLHS = true; + if (vm.count("MaxSpanFreeNonTermSource")) params.maxSpanFreeNonTermSource = vm["MaxSpanFreeNonTermSource"].as<int>(); + if (vm.count("NoNieceTerminal")) params.nieceTerminal = false; + if (vm.count("MaxScope")) params.maxScope = vm["MaxScope"].as<int>(); + if (vm.count("MinScope")) params.minScope = vm["MinScope"].as<int>(); + + // properties + if (vm.count("SpanLength")) params.spanLength = true; + if (vm.count("NonTermContext")) params.nonTermContext = true; + if (vm.count("NonTermContextTarget")) params.nonTermContextTarget = true; + if (vm.count("NonTermContextFactor")) params.nonTermContextFactor = vm["NonTermContextFactor"].as<int>(); + + if (vm.count("NumSourceFactors")) params.numSourceFactors = vm["NumSourceFactors"].as<int>(); + if (vm.count("NumTargetFactors")) params.numTargetFactors = vm["NumTargetFactors"].as<int>(); + + if (vm.count("HieroNonTerm")) params.hieroNonTerm = vm["HieroNonTerm"].as<string>(); + if (vm.count("ScopeSpan")) { + params.SetScopeSpan(vm["ScopeSpan"].as<string>()); + } + + if (vm.count("NonTermConsecSource")) params.nonTermConsecSource = true; + + // input files; + string pathTarget = argv[1]; + string pathSource = argv[2]; + string pathAlignment = argv[3]; + + string pathExtract = argv[4]; + string pathExtractInv = pathExtract + ".inv"; + if (params.gzOutput) { + pathExtract += ".gz"; + pathExtractInv += ".gz"; + } + + Moses::InputFileStream strmTarget(pathTarget); + Moses::InputFileStream strmSource(pathSource); + Moses::InputFileStream strmAlignment(pathAlignment); + Moses::OutputFileStream extractFile(pathExtract); + Moses::OutputFileStream extractInvFile(pathExtractInv); + + + // MAIN LOOP + int lineNum = 1; + string lineTarget, lineSource, lineAlignment; + while (getline(strmTarget, lineTarget)) { + if (lineNum % 10000 == 0) { + cerr << lineNum << " "; + } + + bool success; + success = getline(strmSource, lineSource); + if (!success) { + throw "Couldn't read source"; + } + success = getline(strmAlignment, lineAlignment); + if (!success) { + throw "Couldn't read alignment"; + } + + /* + cerr << "lineTarget=" << lineTarget << endl; + cerr << "lineSource=" << lineSource << endl; + cerr << "lineAlignment=" << lineAlignment << endl; + */ + + AlignedSentence *alignedSentence; + + if (params.sourceSyntax || params.targetSyntax) { + alignedSentence = new AlignedSentenceSyntax(lineNum, lineSource, lineTarget, lineAlignment); + } + else { + alignedSentence = new AlignedSentence(lineNum, lineSource, lineTarget, lineAlignment); + } + + alignedSentence->Create(params); + //cerr << alignedSentence->Debug(); + + Rules rules(*alignedSentence); + rules.Extend(params); + rules.Consolidate(params); + //cerr << rules.Debug(); + + rules.Output(extractFile, true, params); + rules.Output(extractInvFile, false, params); + + delete alignedSentence; + + ++lineNum; + } + + if (!params.gluePath.empty()) { + Moses::OutputFileStream glueFile(params.gluePath); + CreateGlueGrammar(glueFile); + } + + cerr << "Finished" << endl; +} + +void CreateGlueGrammar(Moses::OutputFileStream &glueFile) +{ + glueFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl + << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl + << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl; + +} |