Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-06-26 02:37:28 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-06-26 02:37:28 +0400
commit: 765e789c0c904906098427af80f9520adff78c17 (patch)
tree: 58b27caff1593391b022b9b896e783ee46f443d8 /scripts
parent: 9bbe553a863cad1ffffd382f17e46135ea5567d6 (diff)
parent: 00f018a4772cebcbf705f8bad57bbd2576713de6 (diff)
Merge branch 'master' of git://github.com/moses-smt/mosesdecoder
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/ems/experiment.meta 11
-rwxr-xr-x scripts/ems/experiment.perl 99
-rwxr-xr-x scripts/recaser/train-recaser.perl 1
-rwxr-xr-x scripts/training/train-model.perl 28
-rwxr-xr-x scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh 33
-rwxr-xr-x scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh 25
-rwxr-xr-x scripts/training/wrappers/suffix-array-create.sh 22
-rwxr-xr-x scripts/training/wrappers/suffix-array-extract.sh 18
8 files changed, 174 insertions, 63 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 8fb50fb52..b294fde9d 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -332,6 +332,13 @@ build-biconcor
default-name: model/biconcor
ignore-unless: biconcor
error: usage
+build-suffix-array
+ in: word-alignment corpus
+ out: phrase-translation-table
+ default-name: model/suffix-array
+ ignore-unless: suffix-array
+ error: usage
+
build-lex-trans
in: word-alignment corpus
out: lexical-translation-table
@@ -362,6 +369,7 @@ extract-phrases
out: extracted-phrases
rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm
default-name: model/extract
+ ignore-if: suffix-array
build-reordering
in: extracted-phrases
out: reordering-table
@@ -373,6 +381,7 @@ build-ttable
out: phrase-translation-table
rerun-on-change: translation-factors hierarchical-rule-set score-settings training-options script EVALUATION:report-precision-by-coverage include-word-alignment-in-rules
default-name: model/phrase-table
+ ignore-if: suffix-array
build-generation
in: corpus
out: generation-table
@@ -820,6 +829,6 @@ analysis-precision
[REPORTING] single
report
- in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model
+ in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model
out: report
default-name: evaluation/report
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index ee6b47850..d105394a6 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -916,6 +916,10 @@ sub define_step {
elsif ($DO_STEP[$i] eq 'TRAINING:build-biconcor') {
&define_training_build_biconcor($i);
}
+ elsif ($DO_STEP[$i] eq 'TRAINING:build-suffix-array') {
+ &define_training_build_suffix_array($i);
+ }
+
elsif ($DO_STEP[$i] eq 'TRAINING:build-lex-trans') {
&define_training_build_lex_trans($i);
}
@@ -1632,6 +1636,24 @@ sub define_training_symmetrize_giza {
&create_step($step_id,$cmd);
}
+sub define_training_build_suffix_array {
+ my ($step_id) = @_;
+
+ my $scripts = &check_and_get("GENERAL:moses-script-dir");
+
+ my ($model, $aligned,$corpus) = &get_output_and_input($step_id);
+ my $sa_exec_dir = &check_and_get("TRAINING:suffix-array");
+ my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
+ my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
+ my $method = &check_and_get("TRAINING:alignment-symmetrization-method");
+
+ my $glue_grammar_file = &versionize(&long_file_name("glue-grammar","model",""));
+
+ my $cmd = "$scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh $sa_exec_dir $corpus.$input_extension $corpus.$output_extension $aligned.$method $model $glue_grammar_file";
+
+ &create_step($step_id,$cmd);
+}
+
sub define_training_build_biconcor {
my ($step_id) = @_;
@@ -1748,18 +1770,35 @@ sub define_training_build_custom_generation {
sub define_training_create_config {
my ($step_id) = @_;
- my ($config,
- $reordering_table,$phrase_translation_table,$generation_table,@LM)
- = &get_output_and_input($step_id);
+ my ($config,$reordering_table,$phrase_translation_table,$generation_table,@LM)
+ = &get_output_and_input($step_id);
my $cmd = &get_training_setting(9);
+ # get model, and whether suffix array is used. Determines the pt implementation.
+ my $hierarchical = &get("TRAINING:hierarchical-rule-set");
+ my $sa_exec_dir = &get("TRAINING:suffix-array");
+
+ my ($ptImpl, $numFF);
+ if ($hierarchical) {
+ if ($sa_exec_dir) {
+ $ptImpl = 10; # suffix array
+ $numFF = 7;
+ }
+ else {
+ $ptImpl = 6; # in-mem SCFG
+ }
+ }
+ else {
+ $ptImpl = 0; # phrase-based
+ }
+
# additional settings for factored models
- $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table",$phrase_translation_table);
- $cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table)
- if $reordering_table;
- $cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table)
- if $generation_table;
+ my $ptCmd = "$phrase_translation_table:$ptImpl";
+ $ptCmd .= ":$numFF" if defined($numFF);
+ $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table",$ptCmd);
+ $cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table) if $reordering_table;
+ $cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table) if $generation_table;
$cmd .= "-config $config ";
my $decoding_graph_backoff = &get("TRAINING:decoding-graph-backoff");
@@ -2139,8 +2178,7 @@ sub define_tuningevaluation_filter {
my $dir = &check_and_get("GENERAL:working-dir");
my $tuning_flag = !defined($set);
- my ($filter_dir,
- $input,$phrase_translation_table,$reordering_table) = &get_output_and_input($step_id);
+ my ($filter_dir,$input,$phrase_translation_table,$reordering_table) = &get_output_and_input($step_id);
my $binarizer = &get("GENERAL:ttable-binarizer");
my $hierarchical = &get("TRAINING:hierarchical-rule-set");
@@ -2164,10 +2202,31 @@ sub define_tuningevaluation_filter {
$settings .= " -Binarizer \"$binarizer\"" if $binarizer;
$settings .= " --Hierarchical" if &get("TRAINING:hierarchical-rule-set");
+ # get model, and whether suffix array is used. Determines the pt implementation.
+ my $sa_exec_dir = &get("TRAINING:suffix-array");
+
+ my ($ptImpl, $numFF);
+ if ($hierarchical) {
+ if ($sa_exec_dir) {
+ $ptImpl = 10; # suffix array
+ $numFF = 7;
+ }
+ else {
+ $ptImpl = 6; # in-mem SCFG
+ }
+ }
+ else {
+ $ptImpl = 0; # phrase-based
+ }
+
# create pseudo-config file
my $config = $tuning_flag ? "$dir/tuning/moses.table.ini.$VERSION" : "$dir/evaluation/$set.moses.table.ini.$VERSION";
my $cmd = &get_training_setting(9);
- $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table",$phrase_translation_table);
+
+ my $ptCmd = "$phrase_translation_table:$ptImpl";
+ $ptCmd .= ":$numFF" if defined($numFF);
+ $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
+
$cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table)
if $reordering_table;
# additional settings for hierarchical models
@@ -2184,9 +2243,21 @@ sub define_tuningevaluation_filter {
$cmd .= "-config $config\n";
# filter command
- $cmd .= "$scripts/training/filter-model-given-input.pl";
- $cmd .= " $filter_dir $config $input_filter $settings\n";
-
+ my $sa_exec_dir = &get("TRAINING:suffix-array");
+ if ($sa_exec_dir) {
+ # suffix array
+ $cmd .= "$scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $sa_exec_dir $phrase_translation_table $input_filter $filter_dir \n";
+
+ my $escaped_filter_dir = $filter_dir;
+ $escaped_filter_dir =~ s/\//\\\\\//g;
+ $cmd .= "cat $config | sed s/10\\ 0\\ 0\\ 7.*/10\\ 0\\ 0\\ 7\\ $escaped_filter_dir/g > $filter_dir/moses.ini \n";
+ }
+ else {
+ # normal phrase table
+ $cmd .= "$scripts/training/filter-model-given-input.pl";
+ $cmd .= " $filter_dir $config $input_filter $settings\n";
+ }
+
# clean-up
$cmd .= "rm $config";
diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl
index a5a707554..f669c5de6 100755
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -149,7 +149,6 @@ sub train_recase_model {
else {
$cmd .= " --lm 0:3:$DIR/cased.srilm.gz:0";
}
- $cmd .= " -scripts-root-dir $SCRIPTS_ROOT_DIR" if $SCRIPTS_ROOT_DIR;
$cmd .= " -config $CONFIG" if $CONFIG;
print STDERR $cmd."\n";
system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 971cdbe06..05b463764 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -1839,13 +1839,27 @@ sub create_ini {
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
$basic_weight_count++ if $_PCFG;
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
- $num_of_ttables++;
- my $ff = $f;
- $ff =~ s/\-/ /;
- my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
- $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
- my $phrase_table_impl = ($_HIERARCHICAL ? 6 : 0);
- print INI "$phrase_table_impl $ff $basic_weight_count $file\n";
+ $num_of_ttables++;
+ my $ff = $f;
+ $ff =~ s/\-/ /;
+ my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
+ my $phrase_table_impl = ($_HIERARCHICAL? 6 : 0);
+
+ if (scalar(@SPECIFIED_TABLE)) {
+ $file = shift @SPECIFIED_TABLE;
+ my @toks = split(/:/,$file);
+ $file = $toks[0];
+ $phrase_table_impl = $toks[1];
+
+ if (@toks == 3) {
+ $basic_weight_count = $toks[2];
+ }
+ }
+ else {
+
+ }
+
+ print INI "$phrase_table_impl $ff $basic_weight_count $file\n";
}
if ($_GLUE_GRAMMAR) {
&full_path(\$___GLUE_GRAMMAR_FILE);
diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh
new file mode 100755
index 000000000..e5210a990
--- /dev/null
+++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/suffix-array-create.sh $SA_EXEC_DIR $SOURCE_CORPUS $TARGET_CORPUS $ALIGNMENT $SA_OUTPUT
+
+# eg.
+#SA_EXEC_DIR=/Users/hieuhoang/workspace/github/cdec/sa-extract
+#SOURCE_CORPUS=/Users/hieuhoang/workspace/data/europarl/exp/fr-en/training/corpus.2.fr
+#TARGET_CORPUS=/Users/hieuhoang/workspace/data/europarl/exp/fr-en/training/corpus.2.en
+#ALIGNMENT=/Users/hieuhoang/workspace/data/europarl/exp/fr-en/model/aligned.3.grow-diag-final-and
+#SA_OUTPUT=/Users/hieuhoang/workspace/data/europarl/exp/fr-en/model/suffix-array.3
+
+
+SA_EXEC_DIR=$1
+SOURCE_CORPUS=$2
+TARGET_CORPUS=$3
+ALIGNMENT=$4
+SA_OUTPUT=$5
+GLUE_GRAMMAR=$6
+
+mkdir $SA_OUTPUT
+
+rm -rf $SA_OUTPUT/bitext
+
+pushd .
+cd $SA_EXEC_DIR
+
+./sa-compile.pl -output $SA_OUTPUT -b bitext_name=$SOURCE_CORPUS,$TARGET_CORPUS -a alignment_name=$ALIGNMENT > $SA_OUTPUT/extract.ini
+
+popd
+
+echo "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0\n" > $GLUE_GRAMMAR
+echo "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0\n" >> $GLUE_GRAMMAR
+echo "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0\n" >> $GLUE_GRAMMAR
diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh
new file mode 100755
index 000000000..eda11dede
--- /dev/null
+++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $SA_EXEC_DIR $MODEL_DIR $INPUT_FILE $OUTPUT_DIR
+
+# eg.
+#SA_EXEC_DIR=/Users/hieuhoang/workspace/github/cdec/sa-extract
+#MODEL_DIR=/Users/hieuhoang/workspace/data/europarl/exp/fr-en/model/suffix-array.3
+#INPUT_FILE=/Users/hieuhoang/workspace/data/europarl/exp/fr-en/tuning/input.lc.2
+#OUTPUT_DIR=/Users/hieuhoang/workspace/data/europarl/exp/fr-en/tuning/filtered.sa.3
+
+SA_EXEC_DIR=$1
+MODEL_DIR=$2
+INPUT_FILE=$3
+OUTPUT_DIR=$4
+
+mkdir $OUTPUT_DIR
+
+pushd .
+cd $OUTPUT_DIR
+
+cat $INPUT_FILE | $SA_EXEC_DIR/escape-testset.pl | $SA_EXEC_DIR/extractor.py -c $MODEL_DIR/extract.ini
+gzip $OUTPUT_DIR/grammar.out.*
+
+popd
+
diff --git a/scripts/training/wrappers/suffix-array-create.sh b/scripts/training/wrappers/suffix-array-create.sh
deleted file mode 100755
index 700269310..000000000
--- a/scripts/training/wrappers/suffix-array-create.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-# execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/suffix-array-create.sh $SA_EXEC_DIR $SOURCE_CORPUS $TARGET_CORPUS $ALIGNMENT $SA_OUTPUT
-
-
-SA_EXEC_DIR=$1
-SOURCE_CORPUS=$2
-TARGET_CORPUS=$3
-ALIGNMENT=$4
-SA_OUTPUT=$5
-
-mkdir $SA_OUTPUT
-
-rm -rf $SA_OUTPUT/bitext
-
-pushd .
-cd $SA_EXEC_DIR
-
-./sa-compile.pl -output $SA_OUTPUT -b bitext_name=$SOURCE_CORPUS,$TARGET_CORPUS -a alignment_name=$ALIGNMENT > $SA_OUTPUT/extract.ini
-
-popd
-
diff --git a/scripts/training/wrappers/suffix-array-extract.sh b/scripts/training/wrappers/suffix-array-extract.sh
deleted file mode 100755
index bc11e2cfc..000000000
--- a/scripts/training/wrappers/suffix-array-extract.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-# execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/suffix-array-extract.sh $SA_EXEC_DIR $MODEL_DIR $INPUT_FILE $OUTPUT_DIR
-
-SA_EXEC_DIR=$1
-MODEL_DIR=$2
-INPUT_FILE=$3
-OUTPUT_DIR=$4
-
-mkdir $OUTPUT_DIR
-
-pushd .
-cd $OUTPUT_DIR
-
-$SA_EXEC_DIR/extractor.py -c $MODEL_DIR/extract.ini < $INPUT_FILE
-
-popd
-