diff options
author | bhaddow <bhaddow@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-09-16 19:37:02 +0400 |
---|---|---|
committer | bhaddow <bhaddow@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-09-16 19:37:02 +0400 |
commit | 4d5b17f44432b0ffba35c630bc3ed58f3bb6bf2f (patch) | |
tree | fe0182dbd17792f7009307f8802343cad7e79394 /scripts | |
parent | 7927439ce5952aa57614cb3bd906179b3aedc217 (diff) |
Option to create extract file with sentence ids
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4229 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/training/phrase-extract/SentenceAlignment.cpp | 1 | ||||
-rw-r--r-- | scripts/training/phrase-extract/SentenceAlignment.h | 1 | ||||
-rw-r--r-- | scripts/training/phrase-extract/extract.cpp | 23 | ||||
-rwxr-xr-x | scripts/training/train-model.perl | 8 |
4 files changed, 29 insertions, 4 deletions
diff --git a/scripts/training/phrase-extract/SentenceAlignment.cpp b/scripts/training/phrase-extract/SentenceAlignment.cpp index 33ec74813..a94a34e36 100644 --- a/scripts/training/phrase-extract/SentenceAlignment.cpp +++ b/scripts/training/phrase-extract/SentenceAlignment.cpp @@ -40,6 +40,7 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int) bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID) { using namespace std; + this->sentenceID = sentenceID; // process sentence strings and store in target and source members. if (!processTargetSentence(targetString, sentenceID)) { diff --git a/scripts/training/phrase-extract/SentenceAlignment.h b/scripts/training/phrase-extract/SentenceAlignment.h index 1b42f6f92..3903602ea 100644 --- a/scripts/training/phrase-extract/SentenceAlignment.h +++ b/scripts/training/phrase-extract/SentenceAlignment.h @@ -31,6 +31,7 @@ public: std::vector<std::string> source; std::vector<int> alignedCountS; std::vector<std::vector<int> > alignedToT; + int sentenceID; virtual bool processTargetSentence(const char *, int); diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp index 0848723df..0d56de3c3 100644 --- a/scripts/training/phrase-extract/extract.cpp +++ b/scripts/training/phrase-extract/extract.cpp @@ -84,9 +84,11 @@ REO_MODEL_TYPE hierType = REO_MSD; ofstream extractFile; ofstream extractFileInv; ofstream extractFileOrientation; +ofstream extractFileSentenceId; int maxPhraseLength; bool orientationFlag = false; bool translationFlag = true; +bool sentenceIdFlag = false; //create extract file with sentence id bool onlyOutputSpanInfo = false; int main(int argc, char* argv[]) @@ -95,7 +97,7 @@ int main(int argc, char* argv[]) << "phrase extraction from an aligned parallel corpus\n"; if (argc < 6) { - cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo]\n"; + cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n"; exit(1); } char* &fileNameE = argv[1]; @@ -111,6 +113,8 @@ int main(int argc, char* argv[]) orientationFlag = true; } else if (strcmp(argv[i],"--NoTTable") == 0) { translationFlag = false; + } else if (strcmp(argv[i], "--SentenceId") == 0) { + sentenceIdFlag = true; } else if(strcmp(argv[i],"--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model's information provided to the option --model " << endl; @@ -199,6 +203,11 @@ int main(int argc, char* argv[]) extractFileOrientation.open(fileNameExtractOrientation.c_str()); } + if (sentenceIdFlag) { + string fileNameExtractSentenceId = fileNameExtract + ".sid"; + extractFileSentenceId.open(fileNameExtractSentenceId.c_str()); + } + int i=0; while(true) { i++; @@ -235,6 +244,9 @@ int main(int argc, char* argv[]) extractFileInv.close(); } if (orientationFlag) extractFileOrientation.close(); + if (sentenceIdFlag) { + extractFileSentenceId.close(); + } } } @@ -605,19 +617,23 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i for(int fi=startF; fi<=endF; fi++) { if (translationFlag) extractFile << sentence.source[fi] << " "; if (orientationFlag) extractFileOrientation << sentence.source[fi] << " "; + if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " "; } if (translationFlag) extractFile << "||| "; if (orientationFlag) extractFileOrientation << "||| "; + if (sentenceIdFlag) extractFileSentenceId << "||| "; // target for(int ei=startE; ei<=endE; ei++) { if (translationFlag) extractFile << sentence.target[ei] << " "; if (translationFlag) extractFileInv << sentence.target[ei] << " "; if (orientationFlag) extractFileOrientation << sentence.target[ei] << " "; + if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " "; } if (translationFlag) extractFile << "|||"; if (translationFlag) extractFileInv << "||| "; if (orientationFlag) extractFileOrientation << "||| "; + if (sentenceIdFlag) extractFileSentenceId << "||| "; // source (for inverse) if (translationFlag) { @@ -640,9 +656,14 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i if (orientationFlag) extractFileOrientation << orientationInfo; + if (sentenceIdFlag) { + extractFileSentenceId << sentence.sentenceID; + } + if (translationFlag) extractFile << "\n"; if (translationFlag) extractFileInv << "\n"; if (orientationFlag) extractFileOrientation << "\n"; + if (sentenceIdFlag) extractFileSentenceId << "\n"; } // if proper conditioning, we need the number of times a source phrase occured diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index ec0ecf79d..d360c6250 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1317,9 +1317,10 @@ sub extract_phrase { $cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $max_length"; } if ($reordering_flag) { - $cmd .= " orientation"; - $cmd .= get_extract_reordering_flags(); - $cmd .= " --NoTTable" if !$ttable_flag; + $cmd .= " orientation"; + $cmd .= get_extract_reordering_flags(); + $cmd .= " --NoTTable" if !$ttable_flag; + $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS); } } map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a); @@ -1330,6 +1331,7 @@ sub extract_phrase { } if (! $___DONT_ZIP) { safesystem("gzip $extract_file.o") if -e "$extract_file.o"; + safesystem("gzip $extract_file.sid") if -e "$extract_file.sid"; if ($ttable_flag) { safesystem("gzip $extract_file.inv") or die("ERROR"); safesystem("gzip $extract_file") or die("ERROR"); |