Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hieu Hoang <fishandfrolick@gmail.com> 2012-05-23 22:02:36 +0400
committer Hieu Hoang <fishandfrolick@gmail.com> 2012-05-23 22:02:36 +0400
commit 761ac4f0fcbf1371daaeae65cfca7ded440fab9a (patch)
tree bdfec7c7d5263106a774e862546abf446ccac3a3 /scripts/training
parent ce3ad73ebfaab3e5a1ab72e88b1c8042661e30da (diff)
parallel extract. Works with reordering
Diffstat (limited to 'scripts/training')
-rwxr-xr-x scripts/training/train-model.perl.missing_bin_dir 92
1 files changed, 56 insertions, 36 deletions
diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir
index c3efa4d30..ff5d7454e 100755
--- a/scripts/training/train-model.perl.missing_bin_dir
+++ b/scripts/training/train-model.perl.missing_bin_dir
@@ -19,7 +19,7 @@ if ($SCRIPTS_ROOTDIR eq '') {
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
-my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_CORPUS,
+my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_CORPUS,
$_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH,
$_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_VERBOSE, $_ALIGNMENT,
$_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS,
@@ -36,6 +36,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
$_ADDITIONAL_INI,
$_DICTIONARY, $_EPPEX);
+my $_CORES = 1;
my $debug = 0; # debug this script, do not delete any files in debug mode
@@ -58,6 +59,7 @@ $_HELP = 1
'temp-dir=s' => \$_TEMP_DIR,
'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
'sort-batch-size=s' => \$_SORT_BATCH_SIZE,
+ 'sort-compress=s' => \$_SORT_COMPRESS,
'extract-file=s' => \$_EXTRACT_FILE,
'alignment=s' => \$_ALIGNMENT,
'alignment-file=s' => \$_ALIGNMENT_FILE,
@@ -114,7 +116,8 @@ $_HELP = 1
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
'eppex:s' => \$_EPPEX,
- 'additional-ini=s' => \$_ADDITIONAL_INI
+ 'additional-ini=s' => \$_ADDITIONAL_INI,
+ 'cores=i' => \$_CORES
);
if ($_HELP) {
@@ -206,8 +209,36 @@ if(!defined $_MGIZA ){
my $MKCLS = "$BINDIR/mkcls";
+# parallel extract
+my $SPLIT_EXEC = `gsplit --help 2>/dev/null`;
+if($SPLIT_EXEC) {
+ $SPLIT_EXEC = 'gsplit';
+}
+else {
+ $SPLIT_EXEC = 'split';
+}
+
+my $SORT_EXEC = `gsort --help 2>/dev/null`;
+if($SORT_EXEC) {
+ $SORT_EXEC = 'gsort';
+}
+else {
+ $SORT_EXEC = 'sort';
+}
+
+my $__SORT_BUFFER_SIZE = "";
+$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
+
+my $__SORT_BATCH_SIZE = "";
+$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;
+
+my $__SORT_COMPRESS = "";
+$__SORT_COMPRESS = "--compress-program=$_SORT_COMPRESS" if $_SORT_COMPRESS;
+
# supporting scripts/binaries from this package
my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
+$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_EXTRACT";
+
my $RULE_EXTRACT;
if (defined($_GHKM)) {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm";
@@ -215,6 +246,8 @@ if (defined($_GHKM)) {
else {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
}
+$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $RULE_EXTRACT";
+
my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score";
my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore";
my $EPPEX = "$SCRIPTS_ROOTDIR/training/eppex/eppex";
@@ -308,12 +341,6 @@ $_DONT_ZIP = $___DONT_ZIP unless $___DONT_ZIP;
my $___TEMP_DIR = $___MODEL_DIR;
$___TEMP_DIR = $_TEMP_DIR if $_TEMP_DIR;
-my $__SORT_BUFFER_SIZE = "";
-$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
-
-my $__SORT_BATCH_SIZE = "";
-$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;
-
my $___CONTINUE = 0;
$___CONTINUE = $_CONTINUE if $_CONTINUE;
@@ -1590,35 +1617,28 @@ sub get_reordering_factored {
}
sub get_reordering {
- my ($extract_file,$reo_model_path) = @_;
- if (-e "$extract_file.o.gz") {
- safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip -c > $extract_file.o.sorted.gz") or die("ERROR");
- }
- else {
- safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o | gzip -c > $extract_file.o.sorted.gz") or die("ERROR");
- }
-
- my $smooth = $___REORDERING_SMOOTH;
-
- print STDERR "(7.2) building tables @ ".`date`;
-
- #create cmd string for lexical reordering scoring
- my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path";
- $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
- for my $mtype (keys %REORDERING_MODEL_TYPES) {
- $cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
- foreach my $model (@REORDERING_MODELS) {
- if ($model->{"type"} eq $mtype) {
- $cmd .= " ".$model->{"filename"};
- }
+ my ($extract_file,$reo_model_path) = @_;
+ my $smooth = $___REORDERING_SMOOTH;
+
+ print STDERR "(7.2) building tables @ ".`date`;
+
+ #create cmd string for lexical reordering scoring
+ my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path";
+ $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
+ for my $mtype (keys %REORDERING_MODEL_TYPES) {
+ $cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
+ foreach my $model (@REORDERING_MODELS) {
+ if ($model->{"type"} eq $mtype) {
+ $cmd .= " ".$model->{"filename"};
+ }
+ }
+ $cmd .= "\"";
}
- $cmd .= "\"";
- }
-
- #Call the lexical reordering scorer
- safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
-
- if (! $debug) { safesystem("rm $extract_file.o.sorted.gz") or die("ERROR");}
+
+ #Call the lexical reordering scorer
+ safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
+
+ if (! $debug) { safesystem("rm $extract_file.o.sorted.gz") or die("ERROR");}
}