From 62d37fa2b66bc6e28839ff054dcffd259a9088fb Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 12 Nov 2012 14:17:48 +0000 Subject: Refactor phrase-extract/Jamfile --- phrase-extract/extract-main.cpp | 769 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 769 insertions(+) create mode 100644 phrase-extract/extract-main.cpp (limited to 'phrase-extract/extract-main.cpp') diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp new file mode 100644 index 000000000..8749f5059 --- /dev/null +++ b/phrase-extract/extract-main.cpp @@ -0,0 +1,769 @@ +/* + * extract.cpp + * Modified by: Rohit Gupta CDAC, Mumbai, India + * on July 15, 2012 to implement parallel processing + * Modified by: Nadi Tomeh - LIMSI/CNRS + * Machine Translation Marathon 2010, Dublin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "SafeGetline.h" +#include "SentenceAlignment.h" +#include "tables-core.h" +#include "InputFileStream.h" +#include "OutputFileStream.h" +#include "PhraseExtractionOptions.h" + +using namespace std; +using namespace MosesTraining; + +namespace MosesTraining { + + +const long int LINE_MAX_LENGTH = 500000 ; + + +// HPhraseVertex represents a point in the alignment matrix +typedef pair HPhraseVertex; + +// Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix: +// bottom-left and top-right +typedef pair HPhrase; + +// HPhraseVector is a vector of HPhrases +typedef vector < HPhrase > HPhraseVector; + +// SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning +// The key of the map is the English index and the value is a set of the source ones +typedef map > HSentenceVertices; + + REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, + int, int, int, int, int, int, int, + bool (*)(int, int), bool (*)(int, int)); + REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, + int, int, int, int, int, int, int, + bool (*)(int, int), bool (*)(int, int), + const HSentenceVertices &, const HSentenceVertices &); + REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, + int, int, int, int, int, int, int, + bool (*)(int, int), bool (*)(int, int), + const HSentenceVertices &, const HSentenceVertices &, + const HSentenceVertices &, const HSentenceVertices &, + REO_POS); + + void insertVertex(HSentenceVertices &, int, int); + void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, + int, int, int, int); + string getOrientString(REO_POS, REO_MODEL_TYPE); + + bool ge(int, int); + bool le(int, int); + bool lt(int, int); + + bool isAligned (SentenceAlignment &, int, int); + int sentenceOffset = 0; + +} + +namespace MosesTraining{ + +class ExtractTask +{ +public: + ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation): + m_sentence(sentence), + m_options(initoptions), + m_extractFile(extractFile), + m_extractFileInv(extractFileInv), + m_extractFileOrientation(extractFileOrientation){} +void Run(); +private: + vector< string > m_extractedPhrases; + vector< string > m_extractedPhrasesInv; + vector< string > m_extractedPhrasesOri; + vector< string > m_extractedPhrasesSid; + void extractBase(SentenceAlignment &); + void extract(SentenceAlignment &); + void addPhrase(SentenceAlignment &, int, int, int, int, string &); + void writePhrasesToFile(); + + SentenceAlignment &m_sentence; + const PhraseExtractionOptions &m_options; + Moses::OutputFileStream &m_extractFile; + Moses::OutputFileStream &m_extractFileInv; + Moses::OutputFileStream &m_extractFileOrientation; +}; +} + +int main(int argc, char* argv[]) +{ + cerr << "PhraseExtract v1.4, written by Philipp Koehn\n" + << "phrase extraction from an aligned parallel corpus\n"; + + if (argc < 6) { + cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; + cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n ]\n"; + exit(1); + } + + Moses::OutputFileStream extractFile; + Moses::OutputFileStream extractFileInv; + Moses::OutputFileStream extractFileOrientation; + const char* const &fileNameE = argv[1]; + const char* const &fileNameF = argv[2]; + const char* const &fileNameA = argv[3]; + const string fileNameExtract = string(argv[4]); + PhraseExtractionOptions options(atoi(argv[5])); + + for(int i=6; i= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') { + cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl; + exit(1); + } + sentenceOffset = atoi(argv[++i]); + } else if (strcmp(argv[i], "--GZOutput") == 0) { + options.initGzOutput(true); + } else if(strcmp(argv[i],"--model") == 0) { + if (i+1 >= argc) { + cerr << "extract: syntax error, no model's information provided to the option --model " << endl; + exit(1); + } + char* modelParams = argv[++i]; + char* modelName = strtok(modelParams, "-"); + char* modelType = strtok(NULL, "-"); + + // REO_MODEL_TYPE intModelType; + + if(strcmp(modelName, "wbe") == 0) { + options.initWordModel(true); + if(strcmp(modelType, "msd") == 0) + options.initWordType(REO_MSD); + else if(strcmp(modelType, "mslr") == 0) + options.initWordType(REO_MSLR); + else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) + options.initWordType(REO_MONO); + else { + cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; + exit(1); + } + } else if(strcmp(modelName, "phrase") == 0) { + options.initPhraseModel(true); + if(strcmp(modelType, "msd") == 0) + options.initPhraseType(REO_MSD); + else if(strcmp(modelType, "mslr") == 0) + options.initPhraseType(REO_MSLR); + else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) + options.initPhraseType(REO_MONO); + else { + cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; + exit(1); + } + } else if(strcmp(modelName, "hier") == 0) { + options.initHierModel(true); + if(strcmp(modelType, "msd") == 0) + options.initHierType(REO_MSD); + else if(strcmp(modelType, "mslr") == 0) + options.initHierType(REO_MSLR); + else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) + options.initHierType(REO_MONO); + else { + cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; + exit(1); + } + } else { + cerr << "extract: syntax error, unknown reordering model: " << modelName << endl; + exit(1); + } + + options.initAllModelsOutputFlag(true); + + } else { + cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; + exit(1); + } + } + + // default reordering model if no model selected + // allows for the old syntax to be used + if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) { + options.initWordModel(true); + options.initWordType(REO_MSD); + } + + // open input files + Moses::InputFileStream eFile(fileNameE); + Moses::InputFileStream fFile(fileNameF); + Moses::InputFileStream aFile(fileNameA); + + istream *eFileP = &eFile; + istream *fFileP = &fFile; + istream *aFileP = &aFile; + + // open output files + if (options.isTranslationFlag()) { + string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":""); + extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str()); + extractFileInv.Open(fileNameExtractInv.c_str()); + } + if (options.isOrientationFlag()) { + string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":""); + extractFileOrientation.Open(fileNameExtractOrientation.c_str()); + } + + int i = sentenceOffset; + while(true) { + i++; + if (i%10000 == 0) cerr << "." << flush; + char englishString[LINE_MAX_LENGTH]; + char foreignString[LINE_MAX_LENGTH]; + char alignmentString[LINE_MAX_LENGTH]; + SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__); + if (eFileP->eof()) break; + SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__); + SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); + SentenceAlignment sentence; + // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; + //az: output src, tgt, and alingment line + if (options.isOnlyOutputSpanInfo()) { + cout << "LOG: SRC: " << foreignString << endl; + cout << "LOG: TGT: " << englishString << endl; + cout << "LOG: ALT: " << alignmentString << endl; + cout << "LOG: PHRASES_BEGIN:" << endl; + } + if (sentence.create( englishString, foreignString, alignmentString, i, false)) { + ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation); + task->Run(); + delete task; + + } + if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases + } + + eFile.Close(); + fFile.Close(); + aFile.Close(); + + //az: only close if we actually opened it + if (!options.isOnlyOutputSpanInfo()) { + if (options.isTranslationFlag()) { + extractFile.Close(); + extractFileInv.Close(); + + } + if (options.isOrientationFlag()){ + extractFileOrientation.Close(); + } + } +} + +namespace MosesTraining +{ +void ExtractTask::Run() { + extract(m_sentence); + writePhrasesToFile(); + m_extractedPhrases.clear(); + m_extractedPhrasesInv.clear(); + m_extractedPhrasesOri.clear(); + m_extractedPhrasesSid.clear(); + +} + +void ExtractTask::extract(SentenceAlignment &sentence) +{ + int countE = sentence.target.size(); + int countF = sentence.source.size(); + + HPhraseVector inboundPhrases; + + HSentenceVertices inTopLeft; + HSentenceVertices inTopRight; + HSentenceVertices inBottomLeft; + HSentenceVertices inBottomRight; + + HSentenceVertices outTopLeft; + HSentenceVertices outTopRight; + HSentenceVertices outBottomLeft; + HSentenceVertices outBottomRight; + + HSentenceVertices::const_iterator it; + + bool relaxLimit = m_options.isHierModel(); + bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel(); + + // check alignments for target phrase startE...endE + // loop over extracted phrases which are compatible with the word-alignments + for(int startE=0; startE usedF = sentence.alignedCountS; + for(int ei=startE; ei<=endE; ei++) { + for(size_t i=0; imaxF) { + maxF = fi; + } + usedF[ fi ]--; + } + } + + if (maxF >= 0 && // aligned to any source words at all + (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits + + // check if source words are aligned to out of bound target words + bool out_of_bounds = false; + for(int fi=minF; fi<=maxF && !out_of_bounds; fi++) + if (usedF[fi]>0) { + // cout << "ouf of bounds: " << fi << "\n"; + out_of_bounds = true; + } + + // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; + if (!out_of_bounds) { + // start point of source phrase may retreat over unaligned + for(int startF=minF; + (startF>=0 && + (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit + (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned + startF--) + // end point of source phrase may advance over unaligned + for(int endF=maxF; + (endFsecond.find(startF-unit) != it->second.end())) + return LEFT; + if(modelType == REO_MONO) + return UNKNOWN; + if((!connectedLeftTop && connectedRightTop) || + ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end())) + return RIGHT; + if(modelType == REO_MSD) + return UNKNOWN; + connectedLeftTop = false; + for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) + if(connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() && + it->second.find(indexF) != it->second.end()) + return DRIGHT; + connectedRightTop = false; + for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) + if(connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() && + it->second.find(indexF) != it->second.end()) + return DLEFT; + return UNKNOWN; +} + +// to be called with countF-1 instead of countF +REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, + bool connectedLeftTop, bool connectedRightTop, + int startF, int endF, int startE, int endE, int countF, int zero, int unit, + bool (*ge)(int, int), bool (*lt)(int, int), + const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft, + const HSentenceVertices & outBottomRight, const HSentenceVertices & outBottomLeft, + REO_POS phraseOrient) +{ + + HSentenceVertices::const_iterator it; + + if(phraseOrient == LEFT || + (connectedLeftTop && !connectedRightTop) || + // (startE == 0 && startF == 0) || + //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) || + ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() && + it->second.find(startF-unit) != it->second.end()) || + ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() && + it->second.find(startF-unit) != it->second.end())) + return LEFT; + if(modelType == REO_MONO) + return UNKNOWN; + if(phraseOrient == RIGHT || + (!connectedLeftTop && connectedRightTop) || + ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && + it->second.find(endF + unit) != it->second.end()) || + ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() && + it->second.find(endF + unit) != it->second.end())) + return RIGHT; + if(modelType == REO_MSD) + return UNKNOWN; + if(phraseOrient != UNKNOWN) + return phraseOrient; + connectedLeftTop = false; + for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) { + if((connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() && + it->second.find(indexF) != it->second.end()) || + (connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() && + it->second.find(indexF) != it->second.end())) + return DRIGHT; + } + connectedRightTop = false; + for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) { + if((connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() && + it->second.find(indexF) != it->second.end()) || + (connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomRight.end() && + it->second.find(indexF) != it->second.end())) + return DLEFT; + } + return UNKNOWN; +} + +bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) +{ + if (ei == -1 && fi == -1) + return true; + if (ei <= -1 || fi <= -1) + return false; + if ((size_t)ei == sentence.target.size() && (size_t)fi == sentence.source.size()) + return true; + if ((size_t)ei >= sentence.target.size() || (size_t)fi >= sentence.source.size()) + return false; + for(size_t i=0; i= second; +} + +bool le(int first, int second) +{ + return first <= second; +} + +bool lt(int first, int second) +{ + return first < second; +} + +void insertVertex( HSentenceVertices & corners, int x, int y ) +{ + set tmp; + tmp.insert(x); + pair< HSentenceVertices::iterator, bool > ret = corners.insert( pair > (y, tmp) ); + if(ret.second == false) { + ret.first->second.insert(x); + } +} + +void insertPhraseVertices( + HSentenceVertices & topLeft, + HSentenceVertices & topRight, + HSentenceVertices & bottomLeft, + HSentenceVertices & bottomRight, + int startF, int startE, int endF, int endE) +{ + + insertVertex(topLeft, startF, startE); + insertVertex(topRight, endF, startE); + insertVertex(bottomLeft, startF, endE); + insertVertex(bottomRight, endF, endE); +} + +string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType) +{ + switch(orient) { + case LEFT: + return "mono"; + break; + case RIGHT: + return "swap"; + break; + case DRIGHT: + return "dright"; + break; + case DLEFT: + return "dleft"; + break; + case UNKNOWN: + switch(modelType) { + case REO_MONO: + return "nomono"; + break; + case REO_MSD: + return "other"; + break; + case REO_MSLR: + return "dright"; + break; + } + break; + } + return ""; +} + +void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) +{ + // source + // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; + ostringstream outextractstr; + ostringstream outextractstrInv; + ostringstream outextractstrOrientation; + + if (m_options.isOnlyOutputSpanInfo()) { + cout << startF << " " << endF << " " << startE << " " << endE << endl; + return; + } + +for(int fi=startF; fi<=endF; fi++) { + if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " "; + if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " "; + } + if (m_options.isTranslationFlag()) outextractstr << "||| "; + if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + + // target + for(int ei=startE; ei<=endE; ei++) { + if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " "; + if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " "; + if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; + } + if (m_options.isTranslationFlag()) outextractstr << "|||"; + if (m_options.isTranslationFlag()) outextractstrInv << "||| "; + if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + + // source (for inverse) + + if (m_options.isTranslationFlag()) { + for(int fi=startF; fi<=endF; fi++) + outextractstrInv << sentence.source[fi] << " "; + outextractstrInv << "|||"; + } + // alignment + if (m_options.isTranslationFlag()) { + for(int ei=startE; ei<=endE; ei++) { + for(unsigned int i=0; i::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){ + outextractFile<data(); + } + for(vector::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){ + outextractFileInv<data(); + } + for(vector::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){ + outextractFileOrientation<data(); + } + + m_extractFile << outextractFile.str(); + m_extractFileInv << outextractFileInv.str(); + m_extractFileOrientation << outextractFileOrientation.str(); +} + +// if proper conditioning, we need the number of times a source phrase occured + +void ExtractTask::extractBase( SentenceAlignment &sentence ) +{ + ostringstream outextractFile; + ostringstream outextractFileInv; + + int countF = sentence.source.size(); + for(int startF=0; startF