diff options
-rw-r--r-- | phrase-extract/PhraseExtractionOptions.h | 12 | ||||
-rw-r--r-- | phrase-extract/SentenceAlignment.cpp | 3 | ||||
-rw-r--r-- | phrase-extract/SentenceAlignment.h | 3 | ||||
-rw-r--r-- | phrase-extract/extract-main.cpp | 31 | ||||
-rw-r--r-- | phrase-extract/extract-rules-main.cpp | 2 |
5 files changed, 44 insertions, 7 deletions
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index 6c7966736..2daeaf0ca 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -46,6 +46,7 @@ class PhraseExtractionOptions { bool includeSentenceIdFlag; //include sentence id in extract file bool onlyOutputSpanInfo; bool gzOutput; + std::string instanceWeightsFile; //weights for each sentence public: PhraseExtractionOptions(const int initmaxPhraseLength): @@ -99,7 +100,11 @@ public: } void initGzOutput (const bool initgzOutput){ gzOutput= initgzOutput; - } + } + void initInstanceWeightsFile(const char* initInstanceWeightsFile) { + instanceWeightsFile = std::string(initInstanceWeightsFile); + } + // functions for getting values bool isAllModelsOutputFlag() const { return allModelsOutputFlag; @@ -136,7 +141,10 @@ public: } bool isGzOutput () const { return gzOutput; - } + } + std::string getInstanceWeightsFile() const { + return instanceWeightsFile; + } }; } diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp index af1cfa953..96ef02865 100644 --- a/phrase-extract/SentenceAlignment.cpp +++ b/phrase-extract/SentenceAlignment.cpp @@ -54,10 +54,11 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bo return true; } -bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID, bool boundaryRules) +bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules) { using namespace std; this->sentenceID = sentenceID; + this->weightString = std::string(weightString); // process sentence strings and store in target and source members. if (!processTargetSentence(targetString, sentenceID, boundaryRules)) { diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h index 7c2988780..76cf950d4 100644 --- a/phrase-extract/SentenceAlignment.h +++ b/phrase-extract/SentenceAlignment.h @@ -35,6 +35,7 @@ public: std::vector<int> alignedCountS; std::vector<std::vector<int> > alignedToT; int sentenceID; + std::string weightString; virtual ~SentenceAlignment(); @@ -43,7 +44,7 @@ public: virtual bool processSourceSentence(const char *, int, bool boundaryRules); bool create(char targetString[], char sourceString[], - char alignmentString[], int sentenceID, bool boundaryRules); + char alignmentString[], char weightString[], int sentenceID, bool boundaryRules); }; diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 8749f5059..92c8a470e 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -114,7 +114,7 @@ int main(int argc, char* argv[]) if (argc < 6) { cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; - cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n ]\n"; + cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n"; exit(1); } @@ -144,6 +144,12 @@ int main(int argc, char* argv[]) sentenceOffset = atoi(argv[++i]); } else if (strcmp(argv[i], "--GZOutput") == 0) { options.initGzOutput(true); + } else if (strcmp(argv[i], "--InstanceWeights") == 0) { + if (i+1 >= argc) { + cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl; + exit(1); + } + options.initInstanceWeightsFile(argv[++i]); } else if(strcmp(argv[i],"--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model's information provided to the option --model " << endl; @@ -220,6 +226,13 @@ int main(int argc, char* argv[]) istream *fFileP = &fFile; istream *aFileP = &aFile; + istream *iwFileP = NULL; + auto_ptr<Moses::InputFileStream> instanceWeightsFile; + if (options.getInstanceWeightsFile().length()) { + instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile())); + iwFileP = instanceWeightsFile.get(); + } + // open output files if (options.isTranslationFlag()) { string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":""); @@ -238,10 +251,14 @@ int main(int argc, char* argv[]) char englishString[LINE_MAX_LENGTH]; char foreignString[LINE_MAX_LENGTH]; char alignmentString[LINE_MAX_LENGTH]; + char weightString[LINE_MAX_LENGTH]; SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__); if (eFileP->eof()) break; SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__); SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); + if (iwFileP) { + SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__); + } SentenceAlignment sentence; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line @@ -251,7 +268,7 @@ int main(int argc, char* argv[]) cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } - if (sentence.create( englishString, foreignString, alignmentString, i, false)) { + if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation); task->Run(); delete task; @@ -695,6 +712,16 @@ for(int fi=startF; fi<=endF; fi++) { if (m_options.isOrientationFlag()) outextractstrOrientation << orientationInfo; + if (m_options.getInstanceWeightsFile().length()) { + if (m_options.isTranslationFlag()) { + outextractstr << " ||| " << sentence.weightString; + outextractstrInv << " ||| " << sentence.weightString; + } + if (m_options.isOrientationFlag()) { + outextractstrOrientation << " ||| " << sentence.weightString; + } + } + if (m_options.isIncludeSentenceIdFlag()) { outextractstr << " ||| " << sentence.sentenceID; } diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 974e575b9..0ab678dbd 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -337,7 +337,7 @@ int main(int argc, char* argv[]) cout << "LOG: PHRASES_BEGIN:" << endl; } - if (sentence.create(targetString, sourceString, alignmentString, i, options.boundaryRules)) { + if (sentence.create(targetString, sourceString, alignmentString,"", i, options.boundaryRules)) { if (options.unknownWordLabelFlag) { collectWordLabelCounts(sentence); } |