Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-11-12 18:17:48 +0400
committerKenneth Heafield <github@kheafield.com>2012-11-12 18:17:48 +0400
commit62d37fa2b66bc6e28839ff054dcffd259a9088fb (patch)
treec578252c0763c81e2583e1a80d5996ea9c9f8bce /phrase-extract/extract-main.cpp
parent4f8f864650c955e65536328bd70f385976ce9063 (diff)
Refactor phrase-extract/Jamfile
Diffstat (limited to 'phrase-extract/extract-main.cpp')
-rw-r--r--phrase-extract/extract-main.cpp769
1 files changed, 769 insertions, 0 deletions
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
new file mode 100644
index 000000000..8749f5059
--- /dev/null
+++ b/phrase-extract/extract-main.cpp
@@ -0,0 +1,769 @@
+/*
+ * extract.cpp
+ * Modified by: Rohit Gupta CDAC, Mumbai, India
+ * on July 15, 2012 to implement parallel processing
+ * Modified by: Nadi Tomeh - LIMSI/CNRS
+ * Machine Translation Marathon 2010, Dublin
+ */
+
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include <assert.h>
+#include <cstring>
+#include <sstream>
+#include <map>
+#include <set>
+#include <vector>
+
+#include "SafeGetline.h"
+#include "SentenceAlignment.h"
+#include "tables-core.h"
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+#include "PhraseExtractionOptions.h"
+
+using namespace std;
+using namespace MosesTraining;
+
+namespace MosesTraining {
+
+
+const long int LINE_MAX_LENGTH = 500000 ;
+
+
+// HPhraseVertex represents a point in the alignment matrix
+typedef pair <int, int> HPhraseVertex;
+
+// Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix:
+// bottom-left and top-right
+typedef pair<HPhraseVertex, HPhraseVertex> HPhrase;
+
+// HPhraseVector is a vector of HPhrases
+typedef vector < HPhrase > HPhraseVector;
+
+// SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning
+// The key of the map is the English index and the value is a set of the source ones
+typedef map <int, set<int> > HSentenceVertices;
+
+ REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ int, int, int, int, int, int, int,
+ bool (*)(int, int), bool (*)(int, int));
+ REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ int, int, int, int, int, int, int,
+ bool (*)(int, int), bool (*)(int, int),
+ const HSentenceVertices &, const HSentenceVertices &);
+ REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ int, int, int, int, int, int, int,
+ bool (*)(int, int), bool (*)(int, int),
+ const HSentenceVertices &, const HSentenceVertices &,
+ const HSentenceVertices &, const HSentenceVertices &,
+ REO_POS);
+
+ void insertVertex(HSentenceVertices &, int, int);
+ void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
+ int, int, int, int);
+ string getOrientString(REO_POS, REO_MODEL_TYPE);
+
+ bool ge(int, int);
+ bool le(int, int);
+ bool lt(int, int);
+
+ bool isAligned (SentenceAlignment &, int, int);
+ int sentenceOffset = 0;
+
+}
+
+namespace MosesTraining{
+
+class ExtractTask
+{
+public:
+ ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation):
+ m_sentence(sentence),
+ m_options(initoptions),
+ m_extractFile(extractFile),
+ m_extractFileInv(extractFileInv),
+ m_extractFileOrientation(extractFileOrientation){}
+void Run();
+private:
+ vector< string > m_extractedPhrases;
+ vector< string > m_extractedPhrasesInv;
+ vector< string > m_extractedPhrasesOri;
+ vector< string > m_extractedPhrasesSid;
+ void extractBase(SentenceAlignment &);
+ void extract(SentenceAlignment &);
+ void addPhrase(SentenceAlignment &, int, int, int, int, string &);
+ void writePhrasesToFile();
+
+ SentenceAlignment &m_sentence;
+ const PhraseExtractionOptions &m_options;
+ Moses::OutputFileStream &m_extractFile;
+ Moses::OutputFileStream &m_extractFileInv;
+ Moses::OutputFileStream &m_extractFileOrientation;
+};
+}
+
+int main(int argc, char* argv[])
+{
+ cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
+ << "phrase extraction from an aligned parallel corpus\n";
+
+ if (argc < 6) {
+ cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
+ cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n ]\n";
+ exit(1);
+ }
+
+ Moses::OutputFileStream extractFile;
+ Moses::OutputFileStream extractFileInv;
+ Moses::OutputFileStream extractFileOrientation;
+ const char* const &fileNameE = argv[1];
+ const char* const &fileNameF = argv[2];
+ const char* const &fileNameA = argv[3];
+ const string fileNameExtract = string(argv[4]);
+ PhraseExtractionOptions options(atoi(argv[5]));
+
+ for(int i=6; i<argc; i++) {
+ if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
+ options.initOnlyOutputSpanInfo(true);
+ } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
+ options.initOrientationFlag(true);
+ } else if (strcmp(argv[i],"--NoTTable") == 0) {
+ options.initTranslationFlag(false);
+ } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
+ options.initIncludeSentenceIdFlag(true);
+ } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+ if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+ cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+ exit(1);
+ }
+ sentenceOffset = atoi(argv[++i]);
+ } else if (strcmp(argv[i], "--GZOutput") == 0) {
+ options.initGzOutput(true);
+ } else if(strcmp(argv[i],"--model") == 0) {
+ if (i+1 >= argc) {
+ cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
+ exit(1);
+ }
+ char* modelParams = argv[++i];
+ char* modelName = strtok(modelParams, "-");
+ char* modelType = strtok(NULL, "-");
+
+ // REO_MODEL_TYPE intModelType;
+
+ if(strcmp(modelName, "wbe") == 0) {
+ options.initWordModel(true);
+ if(strcmp(modelType, "msd") == 0)
+ options.initWordType(REO_MSD);
+ else if(strcmp(modelType, "mslr") == 0)
+ options.initWordType(REO_MSLR);
+ else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
+ options.initWordType(REO_MONO);
+ else {
+ cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
+ exit(1);
+ }
+ } else if(strcmp(modelName, "phrase") == 0) {
+ options.initPhraseModel(true);
+ if(strcmp(modelType, "msd") == 0)
+ options.initPhraseType(REO_MSD);
+ else if(strcmp(modelType, "mslr") == 0)
+ options.initPhraseType(REO_MSLR);
+ else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
+ options.initPhraseType(REO_MONO);
+ else {
+ cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
+ exit(1);
+ }
+ } else if(strcmp(modelName, "hier") == 0) {
+ options.initHierModel(true);
+ if(strcmp(modelType, "msd") == 0)
+ options.initHierType(REO_MSD);
+ else if(strcmp(modelType, "mslr") == 0)
+ options.initHierType(REO_MSLR);
+ else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
+ options.initHierType(REO_MONO);
+ else {
+ cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
+ exit(1);
+ }
+ } else {
+ cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
+ exit(1);
+ }
+
+ options.initAllModelsOutputFlag(true);
+
+ } else {
+ cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
+ exit(1);
+ }
+ }
+
+ // default reordering model if no model selected
+ // allows for the old syntax to be used
+ if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
+ options.initWordModel(true);
+ options.initWordType(REO_MSD);
+ }
+
+ // open input files
+ Moses::InputFileStream eFile(fileNameE);
+ Moses::InputFileStream fFile(fileNameF);
+ Moses::InputFileStream aFile(fileNameA);
+
+ istream *eFileP = &eFile;
+ istream *fFileP = &fFile;
+ istream *aFileP = &aFile;
+
+ // open output files
+ if (options.isTranslationFlag()) {
+ string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
+ extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
+ extractFileInv.Open(fileNameExtractInv.c_str());
+ }
+ if (options.isOrientationFlag()) {
+ string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
+ extractFileOrientation.Open(fileNameExtractOrientation.c_str());
+ }
+
+ int i = sentenceOffset;
+ while(true) {
+ i++;
+ if (i%10000 == 0) cerr << "." << flush;
+ char englishString[LINE_MAX_LENGTH];
+ char foreignString[LINE_MAX_LENGTH];
+ char alignmentString[LINE_MAX_LENGTH];
+ SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
+ if (eFileP->eof()) break;
+ SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
+ SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+ SentenceAlignment sentence;
+ // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
+ //az: output src, tgt, and alingment line
+ if (options.isOnlyOutputSpanInfo()) {
+ cout << "LOG: SRC: " << foreignString << endl;
+ cout << "LOG: TGT: " << englishString << endl;
+ cout << "LOG: ALT: " << alignmentString << endl;
+ cout << "LOG: PHRASES_BEGIN:" << endl;
+ }
+ if (sentence.create( englishString, foreignString, alignmentString, i, false)) {
+ ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation);
+ task->Run();
+ delete task;
+
+ }
+ if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+ }
+
+ eFile.Close();
+ fFile.Close();
+ aFile.Close();
+
+ //az: only close if we actually opened it
+ if (!options.isOnlyOutputSpanInfo()) {
+ if (options.isTranslationFlag()) {
+ extractFile.Close();
+ extractFileInv.Close();
+
+ }
+ if (options.isOrientationFlag()){
+ extractFileOrientation.Close();
+ }
+ }
+}
+
+namespace MosesTraining
+{
+void ExtractTask::Run() {
+ extract(m_sentence);
+ writePhrasesToFile();
+ m_extractedPhrases.clear();
+ m_extractedPhrasesInv.clear();
+ m_extractedPhrasesOri.clear();
+ m_extractedPhrasesSid.clear();
+
+}
+
+void ExtractTask::extract(SentenceAlignment &sentence)
+{
+ int countE = sentence.target.size();
+ int countF = sentence.source.size();
+
+ HPhraseVector inboundPhrases;
+
+ HSentenceVertices inTopLeft;
+ HSentenceVertices inTopRight;
+ HSentenceVertices inBottomLeft;
+ HSentenceVertices inBottomRight;
+
+ HSentenceVertices outTopLeft;
+ HSentenceVertices outTopRight;
+ HSentenceVertices outBottomLeft;
+ HSentenceVertices outBottomRight;
+
+ HSentenceVertices::const_iterator it;
+
+ bool relaxLimit = m_options.isHierModel();
+ bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
+
+ // check alignments for target phrase startE...endE
+ // loop over extracted phrases which are compatible with the word-alignments
+ for(int startE=0; startE<countE; startE++) {
+ for(int endE=startE;
+ (endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
+ endE++) {
+
+ int minF = 9999;
+ int maxF = -1;
+ vector< int > usedF = sentence.alignedCountS;
+ for(int ei=startE; ei<=endE; ei++) {
+ for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
+ int fi = sentence.alignedToT[ei][i];
+ if (fi<minF) {
+ minF = fi;
+ }
+ if (fi>maxF) {
+ maxF = fi;
+ }
+ usedF[ fi ]--;
+ }
+ }
+
+ if (maxF >= 0 && // aligned to any source words at all
+ (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
+
+ // check if source words are aligned to out of bound target words
+ bool out_of_bounds = false;
+ for(int fi=minF; fi<=maxF && !out_of_bounds; fi++)
+ if (usedF[fi]>0) {
+ // cout << "ouf of bounds: " << fi << "\n";
+ out_of_bounds = true;
+ }
+
+ // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
+ if (!out_of_bounds) {
+ // start point of source phrase may retreat over unaligned
+ for(int startF=minF;
+ (startF>=0 &&
+ (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
+ (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
+ startF--)
+ // end point of source phrase may advance over unaligned
+ for(int endF=maxF;
+ (endF<countF &&
+ (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
+ (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
+ endF++) { // at this point we have extracted a phrase
+ if(buildExtraStructure) { // phrase || hier
+ if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
+ inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
+ HPhraseVertex(endF,endE)));
+ insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
+ startF, startE, endF, endE);
+ } else
+ insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
+ startF, startE, endF, endE);
+ } else {
+ string orientationInfo = "";
+ if(m_options.isWordModel()) {
+ REO_POS wordPrevOrient, wordNextOrient;
+ bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
+ bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
+ bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
+ bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
+ wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
+ wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
+ orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
+ if(m_options.isAllModelsOutputFlag())
+ " | | ";
+ }
+ addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if(buildExtraStructure) { // phrase || hier
+ string orientationInfo = "";
+ REO_POS wordPrevOrient, wordNextOrient, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient;
+
+ for(size_t i = 0; i < inboundPhrases.size(); i++) {
+ int startF = inboundPhrases[i].first.first;
+ int startE = inboundPhrases[i].first.second;
+ int endF = inboundPhrases[i].second.first;
+ int endE = inboundPhrases[i].second.second;
+
+ bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
+ bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
+ bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
+ bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
+
+ if(m_options.isWordModel()) {
+ wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
+ connectedLeftTopP, connectedRightTopP,
+ startF, endF, startE, endE, countF, 0, 1,
+ &ge, &lt);
+ wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
+ connectedLeftTopN, connectedRightTopN,
+ endF, startF, endE, startE, 0, countF, -1,
+ &lt, &ge);
+ }
+ if (m_options.isPhraseModel()) {
+ phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
+ connectedLeftTopP, connectedRightTopP,
+ startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
+ phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
+ connectedLeftTopN, connectedRightTopN,
+ endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
+ } else {
+ phrasePrevOrient = phraseNextOrient = UNKNOWN;
+ }
+ if(m_options.isHierModel()) {
+ hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
+ connectedLeftTopP, connectedRightTopP,
+ startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
+ hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
+ connectedLeftTopN, connectedRightTopN,
+ endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
+ }
+
+ orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
+ ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
+ ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
+
+ addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
+ }
+ }
+}
+
+REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+ bool connectedLeftTop, bool connectedRightTop,
+ int startF, int endF, int startE, int endE, int countF, int zero, int unit,
+ bool (*ge)(int, int), bool (*lt)(int, int) )
+{
+
+ if( connectedLeftTop && !connectedRightTop)
+ return LEFT;
+ if(modelType == REO_MONO)
+ return UNKNOWN;
+ if (!connectedLeftTop && connectedRightTop)
+ return RIGHT;
+ if(modelType == REO_MSD)
+ return UNKNOWN;
+ for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
+ connectedLeftTop = isAligned(sentence, indexF, startE-unit);
+ for(int indexF=endF+2*unit; (*lt)(indexF,countF) && !connectedRightTop; indexF=indexF+unit)
+ connectedRightTop = isAligned(sentence, indexF, startE-unit);
+ if(connectedLeftTop && !connectedRightTop)
+ return DRIGHT;
+ else if(!connectedLeftTop && connectedRightTop)
+ return DLEFT;
+ return UNKNOWN;
+}
+
+// to be called with countF-1 instead of countF
+REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+ bool connectedLeftTop, bool connectedRightTop,
+ int startF, int endF, int startE, int endE, int countF, int zero, int unit,
+ bool (*ge)(int, int), bool (*lt)(int, int),
+ const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft)
+{
+
+ HSentenceVertices::const_iterator it;
+
+ if((connectedLeftTop && !connectedRightTop) ||
+ //(startE == 0 && startF == 0) ||
+ //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) ||
+ ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
+ it->second.find(startF-unit) != it->second.end()))
+ return LEFT;
+ if(modelType == REO_MONO)
+ return UNKNOWN;
+ if((!connectedLeftTop && connectedRightTop) ||
+ ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end()))
+ return RIGHT;
+ if(modelType == REO_MSD)
+ return UNKNOWN;
+ connectedLeftTop = false;
+ for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
+ if(connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
+ it->second.find(indexF) != it->second.end())
+ return DRIGHT;
+ connectedRightTop = false;
+ for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit)
+ if(connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() &&
+ it->second.find(indexF) != it->second.end())
+ return DLEFT;
+ return UNKNOWN;
+}
+
+// to be called with countF-1 instead of countF
+REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+ bool connectedLeftTop, bool connectedRightTop,
+ int startF, int endF, int startE, int endE, int countF, int zero, int unit,
+ bool (*ge)(int, int), bool (*lt)(int, int),
+ const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft,
+ const HSentenceVertices & outBottomRight, const HSentenceVertices & outBottomLeft,
+ REO_POS phraseOrient)
+{
+
+ HSentenceVertices::const_iterator it;
+
+ if(phraseOrient == LEFT ||
+ (connectedLeftTop && !connectedRightTop) ||
+ // (startE == 0 && startF == 0) ||
+ //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) ||
+ ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
+ it->second.find(startF-unit) != it->second.end()) ||
+ ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
+ it->second.find(startF-unit) != it->second.end()))
+ return LEFT;
+ if(modelType == REO_MONO)
+ return UNKNOWN;
+ if(phraseOrient == RIGHT ||
+ (!connectedLeftTop && connectedRightTop) ||
+ ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
+ it->second.find(endF + unit) != it->second.end()) ||
+ ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() &&
+ it->second.find(endF + unit) != it->second.end()))
+ return RIGHT;
+ if(modelType == REO_MSD)
+ return UNKNOWN;
+ if(phraseOrient != UNKNOWN)
+ return phraseOrient;
+ connectedLeftTop = false;
+ for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) {
+ if((connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
+ it->second.find(indexF) != it->second.end()) ||
+ (connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
+ it->second.find(indexF) != it->second.end()))
+ return DRIGHT;
+ }
+ connectedRightTop = false;
+ for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) {
+ if((connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() &&
+ it->second.find(indexF) != it->second.end()) ||
+ (connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomRight.end() &&
+ it->second.find(indexF) != it->second.end()))
+ return DLEFT;
+ }
+ return UNKNOWN;
+}
+
+bool isAligned ( SentenceAlignment &sentence, int fi, int ei )
+{
+ if (ei == -1 && fi == -1)
+ return true;
+ if (ei <= -1 || fi <= -1)
+ return false;
+ if ((size_t)ei == sentence.target.size() && (size_t)fi == sentence.source.size())
+ return true;
+ if ((size_t)ei >= sentence.target.size() || (size_t)fi >= sentence.source.size())
+ return false;
+ for(size_t i=0; i<sentence.alignedToT[ei].size(); i++)
+ if (sentence.alignedToT[ei][i] == fi)
+ return true;
+ return false;
+}
+
+bool ge(int first, int second)
+{
+ return first >= second;
+}
+
+bool le(int first, int second)
+{
+ return first <= second;
+}
+
+bool lt(int first, int second)
+{
+ return first < second;
+}
+
+void insertVertex( HSentenceVertices & corners, int x, int y )
+{
+ set<int> tmp;
+ tmp.insert(x);
+ pair< HSentenceVertices::iterator, bool > ret = corners.insert( pair<int, set<int> > (y, tmp) );
+ if(ret.second == false) {
+ ret.first->second.insert(x);
+ }
+}
+
+void insertPhraseVertices(
+ HSentenceVertices & topLeft,
+ HSentenceVertices & topRight,
+ HSentenceVertices & bottomLeft,
+ HSentenceVertices & bottomRight,
+ int startF, int startE, int endF, int endE)
+{
+
+ insertVertex(topLeft, startF, startE);
+ insertVertex(topRight, endF, startE);
+ insertVertex(bottomLeft, startF, endE);
+ insertVertex(bottomRight, endF, endE);
+}
+
+string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
+{
+ switch(orient) {
+ case LEFT:
+ return "mono";
+ break;
+ case RIGHT:
+ return "swap";
+ break;
+ case DRIGHT:
+ return "dright";
+ break;
+ case DLEFT:
+ return "dleft";
+ break;
+ case UNKNOWN:
+ switch(modelType) {
+ case REO_MONO:
+ return "nomono";
+ break;
+ case REO_MSD:
+ return "other";
+ break;
+ case REO_MSLR:
+ return "dright";
+ break;
+ }
+ break;
+ }
+ return "";
+}
+
+void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+{
+ // source
+ // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+ ostringstream outextractstr;
+ ostringstream outextractstrInv;
+ ostringstream outextractstrOrientation;
+
+ if (m_options.isOnlyOutputSpanInfo()) {
+ cout << startF << " " << endF << " " << startE << " " << endE << endl;
+ return;
+ }
+
+for(int fi=startF; fi<=endF; fi++) {
+ if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
+ }
+ if (m_options.isTranslationFlag()) outextractstr << "||| ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+
+ // target
+ for(int ei=startE; ei<=endE; ei++) {
+ if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
+ if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
+ }
+ if (m_options.isTranslationFlag()) outextractstr << "|||";
+ if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+
+ // source (for inverse)
+
+ if (m_options.isTranslationFlag()) {
+ for(int fi=startF; fi<=endF; fi++)
+ outextractstrInv << sentence.source[fi] << " ";
+ outextractstrInv << "|||";
+ }
+ // alignment
+ if (m_options.isTranslationFlag()) {
+ for(int ei=startE; ei<=endE; ei++) {
+ for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
+ int fi = sentence.alignedToT[ei][i];
+ outextractstr << " " << fi-startF << "-" << ei-startE;
+ outextractstrInv << " " << ei-startE << "-" << fi-startF;
+ }
+ }
+ }
+
+ if (m_options.isOrientationFlag())
+ outextractstrOrientation << orientationInfo;
+
+ if (m_options.isIncludeSentenceIdFlag()) {
+ outextractstr << " ||| " << sentence.sentenceID;
+ }
+
+ if (m_options.isTranslationFlag()) outextractstr << "\n";
+ if (m_options.isTranslationFlag()) outextractstrInv << "\n";
+ if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
+
+
+ m_extractedPhrases.push_back(outextractstr.str());
+ m_extractedPhrasesInv.push_back(outextractstrInv.str());
+ m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
+}
+
+
+void ExtractTask::writePhrasesToFile(){
+
+ ostringstream outextractFile;
+ ostringstream outextractFileInv;
+ ostringstream outextractFileOrientation;
+
+ for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
+ outextractFile<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
+ outextractFileInv<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
+ outextractFileOrientation<<phrase->data();
+ }
+
+ m_extractFile << outextractFile.str();
+ m_extractFileInv << outextractFileInv.str();
+ m_extractFileOrientation << outextractFileOrientation.str();
+}
+
+// if proper conditioning, we need the number of times a source phrase occured
+
+void ExtractTask::extractBase( SentenceAlignment &sentence )
+{
+ ostringstream outextractFile;
+ ostringstream outextractFileInv;
+
+ int countF = sentence.source.size();
+ for(int startF=0; startF<countF; startF++) {
+ for(int endF=startF;
+ (endF<countF && endF<startF+m_options.maxPhraseLength);
+ endF++) {
+ for(int fi=startF; fi<=endF; fi++) {
+ outextractFile << sentence.source[fi] << " ";
+ }
+ outextractFile << "|||" << endl;
+ }
+ }
+
+ int countE = sentence.target.size();
+ for(int startE=0; startE<countE; startE++) {
+ for(int endE=startE;
+ (endE<countE && endE<startE+m_options.maxPhraseLength);
+ endE++) {
+ for(int ei=startE; ei<=endE; ei++) {
+ outextractFileInv << sentence.target[ei] << " ";
+ }
+ outextractFileInv << "|||" << endl;
+ }
+ }
+ m_extractFile << outextractFile.str();
+ m_extractFileInv << outextractFileInv.str();
+
+}
+
+}