Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: bhaddow <bhaddow@1f5c12ca-751b-0410-a591-d2e778427230> 2011-09-16 19:37:02 +0400
committer: bhaddow <bhaddow@1f5c12ca-751b-0410-a591-d2e778427230> 2011-09-16 19:37:02 +0400
commit: 4d5b17f44432b0ffba35c630bc3ed58f3bb6bf2f (patch)
tree: fe0182dbd17792f7009307f8802343cad7e79394 /scripts
parent: 7927439ce5952aa57614cb3bd906179b3aedc217 (diff)
Option to create extract file with sentence ids
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4229 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/training/phrase-extract/SentenceAlignment.cpp | 1
-rw-r--r-- scripts/training/phrase-extract/SentenceAlignment.h | 1
-rw-r--r-- scripts/training/phrase-extract/extract.cpp | 23
-rwxr-xr-x scripts/training/train-model.perl | 8
4 files changed, 29 insertions, 4 deletions
diff --git a/scripts/training/phrase-extract/SentenceAlignment.cpp b/scripts/training/phrase-extract/SentenceAlignment.cpp
index 33ec74813..a94a34e36 100644
--- a/scripts/training/phrase-extract/SentenceAlignment.cpp
+++ b/scripts/training/phrase-extract/SentenceAlignment.cpp
@@ -40,6 +40,7 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int)
bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID)
{
using namespace std;
+ this->sentenceID = sentenceID;
// process sentence strings and store in target and source members.
if (!processTargetSentence(targetString, sentenceID)) {
diff --git a/scripts/training/phrase-extract/SentenceAlignment.h b/scripts/training/phrase-extract/SentenceAlignment.h
index 1b42f6f92..3903602ea 100644
--- a/scripts/training/phrase-extract/SentenceAlignment.h
+++ b/scripts/training/phrase-extract/SentenceAlignment.h
@@ -31,6 +31,7 @@ public:
std::vector<std::string> source;
std::vector<int> alignedCountS;
std::vector<std::vector<int> > alignedToT;
+ int sentenceID;
virtual bool processTargetSentence(const char *, int);
diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp
index 0848723df..0d56de3c3 100644
--- a/scripts/training/phrase-extract/extract.cpp
+++ b/scripts/training/phrase-extract/extract.cpp
@@ -84,9 +84,11 @@ REO_MODEL_TYPE hierType = REO_MSD;
ofstream extractFile;
ofstream extractFileInv;
ofstream extractFileOrientation;
+ofstream extractFileSentenceId;
int maxPhraseLength;
bool orientationFlag = false;
bool translationFlag = true;
+bool sentenceIdFlag = false; //create extract file with sentence id
bool onlyOutputSpanInfo = false;
int main(int argc, char* argv[])
@@ -95,7 +97,7 @@ int main(int argc, char* argv[])
<< "phrase extraction from an aligned parallel corpus\n";
if (argc < 6) {
- cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo]\n";
+ cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
exit(1);
}
char* &fileNameE = argv[1];
@@ -111,6 +113,8 @@ int main(int argc, char* argv[])
orientationFlag = true;
} else if (strcmp(argv[i],"--NoTTable") == 0) {
translationFlag = false;
+ } else if (strcmp(argv[i], "--SentenceId") == 0) {
+ sentenceIdFlag = true;
} else if(strcmp(argv[i],"--model") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
@@ -199,6 +203,11 @@ int main(int argc, char* argv[])
extractFileOrientation.open(fileNameExtractOrientation.c_str());
}
+ if (sentenceIdFlag) {
+ string fileNameExtractSentenceId = fileNameExtract + ".sid";
+ extractFileSentenceId.open(fileNameExtractSentenceId.c_str());
+ }
+
int i=0;
while(true) {
i++;
@@ -235,6 +244,9 @@ int main(int argc, char* argv[])
extractFileInv.close();
}
if (orientationFlag) extractFileOrientation.close();
+ if (sentenceIdFlag) {
+ extractFileSentenceId.close();
+ }
}
}
@@ -605,19 +617,23 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
for(int fi=startF; fi<=endF; fi++) {
if (translationFlag) extractFile << sentence.source[fi] << " ";
if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
+ if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
}
if (translationFlag) extractFile << "||| ";
if (orientationFlag) extractFileOrientation << "||| ";
+ if (sentenceIdFlag) extractFileSentenceId << "||| ";
// target
for(int ei=startE; ei<=endE; ei++) {
if (translationFlag) extractFile << sentence.target[ei] << " ";
if (translationFlag) extractFileInv << sentence.target[ei] << " ";
if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
+ if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
}
if (translationFlag) extractFile << "|||";
if (translationFlag) extractFileInv << "||| ";
if (orientationFlag) extractFileOrientation << "||| ";
+ if (sentenceIdFlag) extractFileSentenceId << "||| ";
// source (for inverse)
if (translationFlag) {
@@ -640,9 +656,14 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
if (orientationFlag)
extractFileOrientation << orientationInfo;
+ if (sentenceIdFlag) {
+ extractFileSentenceId << sentence.sentenceID;
+ }
+
if (translationFlag) extractFile << "\n";
if (translationFlag) extractFileInv << "\n";
if (orientationFlag) extractFileOrientation << "\n";
+ if (sentenceIdFlag) extractFileSentenceId << "\n";
}
// if proper conditioning, we need the number of times a source phrase occured
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index ec0ecf79d..d360c6250 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -1317,9 +1317,10 @@ sub extract_phrase {
$cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $max_length";
}
if ($reordering_flag) {
- $cmd .= " orientation";
- $cmd .= get_extract_reordering_flags();
- $cmd .= " --NoTTable" if !$ttable_flag;
+ $cmd .= " orientation";
+ $cmd .= get_extract_reordering_flags();
+ $cmd .= " --NoTTable" if !$ttable_flag;
+ $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
}
map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
@@ -1330,6 +1331,7 @@ sub extract_phrase {
}
if (! $___DONT_ZIP) {
safesystem("gzip $extract_file.o") if -e "$extract_file.o";
+ safesystem("gzip $extract_file.sid") if -e "$extract_file.sid";
if ($ttable_flag) {
safesystem("gzip $extract_file.inv") or die("ERROR");
safesystem("gzip $extract_file") or die("ERROR");