From 7138056b8fd60b4572f016f038360dd5fb19a39c Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Sat, 7 Sep 2013 23:04:01 +0200 Subject: flexibility scores --- phrase-extract/extract-main.cpp | 96 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 3 deletions(-) (limited to 'phrase-extract/extract-main.cpp') diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 4804f83b8..5bc018173 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -80,6 +80,8 @@ int sentenceOffset = 0; std::vector Tokenize(const std::string& str, const std::string& delimiters = " \t"); +bool flexScoreFlag = false; + } namespace MosesTraining @@ -88,18 +90,22 @@ namespace MosesTraining class ExtractTask { public: - ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation): + ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation, Moses::OutputFileStream &extractFileContext, Moses::OutputFileStream &extractFileContextInv): m_sentence(sentence), m_options(initoptions), m_extractFile(extractFile), m_extractFileInv(extractFileInv), - m_extractFileOrientation(extractFileOrientation) {} + m_extractFileOrientation(extractFileOrientation), + m_extractFileContext(extractFileContext), + m_extractFileContextInv(extractFileContextInv) {} void Run(); private: vector< string > m_extractedPhrases; vector< string > m_extractedPhrasesInv; vector< string > m_extractedPhrasesOri; vector< string > m_extractedPhrasesSid; + vector< string > m_extractedPhrasesContext; + vector< string > m_extractedPhrasesContextInv; void extractBase(SentenceAlignment &); void extract(SentenceAlignment &); void addPhrase(SentenceAlignment &, int, int, int, int, string &); @@ -112,6 +118,8 @@ private: Moses::OutputFileStream &m_extractFile; Moses::OutputFileStream &m_extractFileInv; Moses::OutputFileStream &m_extractFileOrientation; + Moses::OutputFileStream &m_extractFileContext; + Moses::OutputFileStream &m_extractFileContextInv; }; } @@ -129,6 +137,8 @@ int main(int argc, char* argv[]) Moses::OutputFileStream extractFile; Moses::OutputFileStream extractFileInv; Moses::OutputFileStream extractFileOrientation; + Moses::OutputFileStream extractFileContext; + Moses::OutputFileStream extractFileContextInv; const char* const &fileNameE = argv[1]; const char* const &fileNameF = argv[2]; const char* const &fileNameA = argv[3]; @@ -140,6 +150,8 @@ int main(int argc, char* argv[]) options.initOnlyOutputSpanInfo(true); } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { options.initOrientationFlag(true); + } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { + options.initFlexScoreFlag(true); } else if (strcmp(argv[i],"--NoTTable") == 0) { options.initTranslationFlag(false); } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) { @@ -254,8 +266,15 @@ int main(int argc, char* argv[]) string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":""); extractFileOrientation.Open(fileNameExtractOrientation.c_str()); } + if (options.isFlexScoreFlag()) { + string fileNameExtractContext = fileNameExtract + ".context" + (options.isGzOutput()?".gz":""); + string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.isGzOutput()?".gz":""); + extractFileContext.Open(fileNameExtractContext.c_str()); + extractFileContextInv.Open(fileNameExtractContextInv.c_str()); + } int i = sentenceOffset; + while(true) { i++; if (i%10000 == 0) cerr << "." << flush; @@ -280,7 +299,7 @@ int main(int argc, char* argv[]) cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { - ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation); + ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileContext, extractFileContextInv); task->Run(); delete task; @@ -302,6 +321,11 @@ int main(int argc, char* argv[]) if (options.isOrientationFlag()) { extractFileOrientation.Close(); } + + if (options.isFlexScoreFlag()) { + extractFileContext.Close(); + extractFileContextInv.Close(); + } } } @@ -315,6 +339,8 @@ void ExtractTask::Run() m_extractedPhrasesInv.clear(); m_extractedPhrasesOri.clear(); m_extractedPhrasesSid.clear(); + m_extractedPhrasesContext.clear(); + m_extractedPhrasesContextInv.clear(); } @@ -680,6 +706,8 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, ostringstream outextractstr; ostringstream outextractstrInv; ostringstream outextractstrOrientation; + ostringstream outextractstrContext; + ostringstream outextractstrContextInv; if (m_options.isOnlyOutputSpanInfo()) { cout << startF << " " << endF << " " << startE << " " << endE << endl; @@ -693,19 +721,25 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, for(int fi=startF; fi<=endF; fi++) { if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " "; if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " "; + if (m_options.isFlexScoreFlag()) outextractstrContext << sentence.source[fi] << " "; } if (m_options.isTranslationFlag()) outextractstr << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + if (m_options.isFlexScoreFlag()) outextractstrContext << "||| "; // target for(int ei=startE; ei<=endE; ei++) { if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " "; if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " "; if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; + if (m_options.isFlexScoreFlag()) outextractstrContext << sentence.target[ei] << " "; + if (m_options.isFlexScoreFlag()) outextractstrContextInv << sentence.target[ei] << " "; } if (m_options.isTranslationFlag()) outextractstr << "|||"; if (m_options.isTranslationFlag()) outextractstrInv << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + if (m_options.isFlexScoreFlag()) outextractstrContext << "||| "; + if (m_options.isFlexScoreFlag()) outextractstrContextInv << "||| "; // source (for inverse) @@ -714,6 +748,12 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, outextractstrInv << sentence.source[fi] << " "; outextractstrInv << "|||"; } + if (m_options.isFlexScoreFlag()) { + for(int fi=startF; fi<=endF; fi++) + outextractstrContextInv << sentence.source[fi] << " "; + outextractstrContextInv << "|||"; + } + // alignment if (m_options.isTranslationFlag()) { for(int ei=startE; ei<=endE; ei++) { @@ -743,6 +783,46 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, } + + // generate two lines for every extracted phrase: + // once with left, once with right context + if (m_options.isFlexScoreFlag()) { + + string strContext = outextractstrContext.str(); + string strContextInv = outextractstrContextInv.str(); + + ostringstream outextractstrContextRight(strContext, ostringstream::app); + ostringstream outextractstrContextRightInv(strContextInv, ostringstream::app); + + // write context to left + outextractstrContext << "< "; + if (startF == 0) outextractstrContext << ""; + else outextractstrContext << sentence.source[startF-1]; + + outextractstrContextInv << " < "; + if (startE == 0) outextractstrContextInv << ""; + else outextractstrContextInv << sentence.target[startE-1]; + + // write context to right + outextractstrContextRight << "> "; + if (endF+1 == sentence.source.size()) outextractstrContextRight << ""; + else outextractstrContextRight << sentence.source[endF+1]; + + outextractstrContextRightInv << " > "; + if (endE+1 == sentence.target.size()) outextractstrContextRightInv << ""; + else outextractstrContextRightInv << sentence.target[endE+1]; + + outextractstrContext << "\n"; + outextractstrContextInv << "\n"; + outextractstrContextRight << "\n"; + outextractstrContextRightInv << "\n"; + + m_extractedPhrasesContext.push_back(outextractstrContext.str()); + m_extractedPhrasesContextInv.push_back(outextractstrContextInv.str()); + m_extractedPhrasesContext.push_back(outextractstrContextRight.str()); + m_extractedPhrasesContextInv.push_back(outextractstrContextRightInv.str()); + } + if (m_options.isTranslationFlag()) outextractstr << "\n"; if (m_options.isTranslationFlag()) outextractstrInv << "\n"; if (m_options.isOrientationFlag()) outextractstrOrientation << "\n"; @@ -760,6 +840,8 @@ void ExtractTask::writePhrasesToFile() ostringstream outextractFile; ostringstream outextractFileInv; ostringstream outextractFileOrientation; + ostringstream outextractFileContext; + ostringstream outextractFileContextInv; for(vector::const_iterator phrase=m_extractedPhrases.begin(); phrase!=m_extractedPhrases.end(); phrase++) { outextractFile<data(); @@ -770,10 +852,18 @@ void ExtractTask::writePhrasesToFile() for(vector::const_iterator phrase=m_extractedPhrasesOri.begin(); phrase!=m_extractedPhrasesOri.end(); phrase++) { outextractFileOrientation<data(); } + for(vector::const_iterator phrase=m_extractedPhrasesContext.begin();phrase!=m_extractedPhrasesContext.end();phrase++){ + outextractFileContext<data(); + } + for(vector::const_iterator phrase=m_extractedPhrasesContextInv.begin();phrase!=m_extractedPhrasesContextInv.end();phrase++){ + outextractFileContextInv<data(); + } m_extractFile << outextractFile.str(); m_extractFileInv << outextractFileInv.str(); m_extractFileOrientation << outextractFileOrientation.str(); + m_extractFileContext << outextractFileContext.str(); + m_extractFileContextInv << outextractFileContextInv.str(); } // if proper conditioning, we need the number of times a source phrase occured -- cgit v1.2.3