diff options
-rwxr-xr-x | scripts/Transliteration/in-decoding-transliteration.pl | 230 | ||||
-rw-r--r-- | scripts/ems/experiment.meta | 19 | ||||
-rwxr-xr-x | scripts/ems/experiment.perl | 15 | ||||
-rwxr-xr-x | scripts/training/train-model.perl | 17 |
4 files changed, 270 insertions, 11 deletions
diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl new file mode 100755 index 000000000..e4f0503a8 --- /dev/null +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -0,0 +1,230 @@ +#!/usr/bin/perl -w + +use strict; + +use utf8; +use File::Basename; +use Getopt::Long "GetOptions"; +use FindBin qw($RealBin); +use Scalar::Util qw(looks_like_number); +use IO::Handle; +binmode(STDIN, ':utf8'); +binmode(STDOUT, ':utf8'); +binmode(STDERR, ':utf8'); + +my $___FACTOR_DELIMITER = "|"; +my $OUT_FILE = "/tmp/transliteration-phrase-table.$$"; + +my ($MOSES_SRC_DIR,$TRANSLIT_MODEL,$OOV_FILE, $OOV_FILE_NAME, $EXTERNAL_BIN_DIR, $LM_FILE, $INPUT_EXTENSION, $OUTPUT_EXTENSION); +die("ERROR: wrong syntax when invoking postDecodingTransliteration.perl") + unless &GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR, + 'external-bin-dir=s' => \$EXTERNAL_BIN_DIR, + 'transliteration-model-dir=s' => \$TRANSLIT_MODEL, + 'input-extension=s' => \$INPUT_EXTENSION, + 'output-extension=s' => \$OUTPUT_EXTENSION, + 'transliteration-file=s' => \$OOV_FILE, + 'out-file=s' => \$OUT_FILE); + +# check if the files are in place +die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --transliteration-file, --input-extension, and --output-extension") + unless (defined($MOSES_SRC_DIR) && + defined($TRANSLIT_MODEL) && + defined($OOV_FILE) && + defined($INPUT_EXTENSION)&& + defined($OUTPUT_EXTENSION)&& + defined($EXTERNAL_BIN_DIR)); + +die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'") + unless -e $TRANSLIT_MODEL; +die("ERROR: could not find Transliteration file $OOV_FILE'") + unless -e $OOV_FILE; + +$OOV_FILE_NAME = basename ($OOV_FILE); + +`mkdir $TRANSLIT_MODEL/evaluation`; +`cp $OOV_FILE $TRANSLIT_MODEL/evaluation/`; +my $translitFile = $TRANSLIT_MODEL . "/evaluation/" . $OOV_FILE_NAME; + +print "Preparing for Transliteration\n"; +prepare_for_transliteration ($OOV_FILE, $translitFile); +print "Run Transliteration\n"; +run_transliteration ($MOSES_SRC_DIR , $EXTERNAL_BIN_DIR , $TRANSLIT_MODEL , $OOV_FILE_NAME); +print "Pick Best Transliteration\n"; +form_corpus ($translitFile , $translitFile.".op.nBest" , $OUT_FILE); + + +################### Read the UNK word file and prepare for Transliteration ############################### + +sub prepare_for_transliteration +{ + my @list = @_; + my $testFile = $list[0]; + my $translitFile = $list[1]; + my %UNK; + my @words; + my $src; + my @tW; + + open MYFILE, "<:encoding(UTF-8)", $testFile or die "Can't open $testFile: $!\n"; + + while (<MYFILE>) + { + chomp; + #print "$_\n"; + @words = split(/ /, "$_"); + + foreach (@words) + { + + @tW = split /\Q$___FACTOR_DELIMITER/; + + if (defined $tW[0]) + { + + if (! ($tW[0] =~ /[0-9.,]/)) + { + $UNK{$tW[0]} = 1; + } + else + { + print "Not transliterating $tW[0] \n"; + } + } + } + } + close (MYFILE); + + open MYFILE, ">:encoding(UTF-8)", $translitFile or die "Can't open $translitFile: $!\n"; + + foreach my $key ( keys %UNK ) + { + $src=join(' ', split('',$key)); + print MYFILE "$src\n"; + } + close (MYFILE); +} + +################### Run Transliteration Module to Obtain Transliterations ############################### + +sub run_transliteration +{ + my @list = @_; + my $MOSES_SRC = $list[0]; + my $EXTERNAL_BIN_DIR = $list[1]; + my $TRANSLIT_MODEL = $list[2]; + my $eval_file = $list[3]; + + `touch $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; + + print "Filter Table\n"; + + `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; + + `$MOSES_SRC/scripts/training/filter-model-given-input.pl $TRANSLIT_MODEL/evaluation/$eval_file.filtered $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini $TRANSLIT_MODEL/evaluation/$eval_file -Binarizer "$MOSES_SRC/bin/processPhraseTable"`; + + `rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; + + print "Apply Filter\n"; + + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`; + + `$MOSES_SRC/bin/moses -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 100 distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini < $TRANSLIT_MODEL/evaluation/$eval_file > $TRANSLIT_MODEL/evaluation/$eval_file.op`; + +} + +################### Read the output of Transliteration Model and Form Corpus ############################### + + +sub form_corpus +{ + + my @list = @_; + my $inp_file = $list[0]; + my $testFile = $list[1]; + my @words; + my $thisStr; + my $features; + my $prev = 0; + my $sNum; + my @UNK; + my %vocab; + + my $antLog = exp(0.2); + my $phraseTable = $list[2]; + + open MYFILE, "<:encoding(UTF-8)", $inp_file or die "Can't open $inp_file: $!\n"; + open PT, ">:encoding(UTF-8)", $phraseTable or die "Can't open $phraseTable: $!\n"; + + while (<MYFILE>) + { + chomp; + #print "$_\n"; + @words = split(/ /, "$_"); + + $thisStr = ""; + foreach (@words) + { + $thisStr = $thisStr . "$_"; + } + + push(@UNK, $thisStr); + $vocab{$thisStr} = 1; + } + close (MYFILE); + + open MYFILE, "<:encoding(UTF-8)", $testFile or die "Can't open $testFile: $!\n"; + my $inpCount = 0; + + while (<MYFILE>) + { + chomp; + #print "$_\n"; + @words = split(/ /, "$_"); + + $sNum = $words[0]; + + if ($prev != $sNum){ + $inpCount++; + } + + my $i = 2; + $thisStr = ""; + $features = ""; + + while ($words[$i] ne "|||") + { + $thisStr = $thisStr . $words[$i]; + $i++; + } + + $i++; + + while ($words[$i] ne "|||") + { + if ($words[$i] =~ /Penalty0/ || $words[$i] eq "Distortion0=" || $words[$i] eq "LM0=" ){ + $i++; + } + elsif (looks_like_number($words[$i])){ + $features = $features . " " . exp($words[$i]); + } + + $i++; + } + $i++; + + #$features = $features . " " . $words[$i]; + + if ($thisStr ne ""){ + print PT "$UNK[$inpCount] ||| $thisStr ||| $features ||| 0-0 ||| 0 0 0\n"; + } + $prev = $sNum; + } + close (MYFILE); + close (PT); + + + `gzip $phraseTable`; + +} + + diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index e2b21019d..83d597aa0 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -533,6 +533,13 @@ build-transliteration-model ignore-unless: transliteration-module rerun-on-change: transliteration-module training-options script giza-settings default-name: model/Transliteration +build-translit-table + in: transliteration-model + out: transliteration-table + ignore-unless: in-decoding-transliteration + rerun-on-change: in-decoding-transliteration transliteration-module + default-name: model/transliteration-phrase-table + template: $moses-script-dir/Transliteration/in-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN --input-extension $input-extension --output-extension $output-extension --transliteration-file $transliteration-file --out-file OUT extract-phrases in: corpus-mml-postfilter=OR=word-alignment scored-corpus out: extracted-phrases @@ -601,7 +608,7 @@ build-sparse default-name: model/sparse-features template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features" create-config - in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-model generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm + in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm out: config ignore-if: use-hiero rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini @@ -863,7 +870,7 @@ split-reference-devtest multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-splitter -model IN1.$output-extension < IN > OUT filter - in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains + in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table out: filtered-dir default-name: tuning/filtered rerun-on-change: filter-settings ttable-binarizer @@ -989,8 +996,8 @@ split-input pass-unless: input-splitter template: $input-splitter -model IN1.$input-extension < IN > OUT filter - in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains - out: filtered-dir + in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table + out: filtered-dir default-name: evaluation/filtered rerun-on-change: filter-settings report-precision-by-coverage ttable-binarizer pass-if: TRAINING:binarize-all @@ -1027,11 +1034,11 @@ remove-markup pass-unless: report-segmentation template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT post-decoding-transliteration - in: cleaned-output system-output TRAINING:transliteration-model LM:binlm + in: cleaned-output system-output TRAINING:transliteration-model out: transliterated-output default-name: evaluation/transliterated pass-unless: TRAINING:post-decoding-transliteration - template: $moses-script-dir/Transliteration/post-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN2 --input-extension $input-extension --output-extension $output-extension --language-model IN3 --output-file IN0 --oov-file IN1.oov + template: $moses-script-dir/Transliteration/post-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN2 --input-extension $input-extension --output-extension $output-extension --language-model $TRAINING:language-model-file --output-file IN0 --oov-file IN1.oov recase-output in: transliterated-output RECASING:recase-config out: recased-output diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 761c7a694..f6a7e4db3 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2233,11 +2233,15 @@ sub get_config_tables { sub define_training_create_config { my ($step_id) = @_; - my ($config,$reordering_table,$phrase_translation_table,$translit_model,$generation_table,$sparse_lexical_features,$domains,$osm, @LM) + my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm, @LM) = &get_output_and_input($step_id); my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains); + if($transliteration_pt){ + $cmd .= "-transliteration-phrase-table $transliteration_pt "; + } + if($osm){ my $osm_settings = &get("TRAINING:operation-sequence-model-settings"); @@ -2623,7 +2627,7 @@ sub define_tuningevaluation_filter { my $tuning_flag = !defined($set); my $hierarchical = &get("TRAINING:hierarchical-rule-set"); - my ($filter_dir,$input,$phrase_translation_table,$reordering_table,$domains) = &get_output_and_input($step_id); + my ($filter_dir,$input,$phrase_translation_table,$reordering_table,$domains,$transliteration_table) = &get_output_and_input($step_id); my $binarizer; $binarizer = &backoff_and_get("EVALUATION:$set:ttable-binarizer") unless $tuning_flag; @@ -2683,7 +2687,14 @@ sub define_tuningevaluation_filter { $cmd .= &get_config_tables($config,$reordering_table,$phrase_translation_table,undef,$domains); + if (&get("TRAINING:in-decoding-transliteration")) { + + $cmd .= "-transliteration-phrase-table $dir/model/transliteration-phrase-table.$VERSION "; + } + + $cmd .= "-lm 0:3:$config:8\n"; # dummy kenlm 3-gram model on factor 0 + } # filter command diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 3764ab0c2..46a7e1fe6 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -31,7 +31,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_ $_DECODING_GRAPH_BACKOFF, $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE, @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, - $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, + $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE, $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS, $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE, $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, @@ -122,7 +122,8 @@ $_HELP = 1 'config=s' => \$_CONFIG, 'osm-model=s' => \$_OSM, 'osm-setting=s' => \$_OSM_FACTORS, - 'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT, + 'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT, + 'transliteration-phrase-table=s' => \$_TRANSLITERATION_PHRASE_TABLE, 'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING, 'do-steps=s' => \$_DO_STEPS, 'memscore:s' => \$_MEMSCORE, @@ -1879,6 +1880,8 @@ sub create_ini { $path++; } print INI "1 T 1\n" if $_GLUE_GRAMMAR; + + print INI "1 T 1\n" if $_TRANSLITERATION_PHRASE_TABLE; if (defined($_DECODING_GRAPH_BACKOFF)) { $_DECODING_GRAPH_BACKOFF =~ s/\s+/ /g; @@ -1962,6 +1965,13 @@ sub create_ini { exit 1 if $i < $stepsused{"T"}; # fatal to define less } + if ($_TRANSLITERATION_PHRASE_TABLE){ + + $feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i table-limit=100 num-features=4 path=$_TRANSLITERATION_PHRASE_TABLE input-factor=0 output-factor=0\n"; + $weight_spec .= "TranslationModel$i= 0.2 0.2 0.2 0.2\n"; + $i++; + } + # glue grammar if ($_GLUE_GRAMMAR) { &full_path(\$___GLUE_GRAMMAR_FILE); @@ -2069,8 +2079,9 @@ sub create_ini { my $lm_oov_prob = 0.1; - if ($_POST_DECODING_TRANSLIT){ + if ($_POST_DECODING_TRANSLIT || $_TRANSLITERATION_PHRASE_TABLE){ $lm_oov_prob = -100.0; + $_LMODEL_OOV_FEATURE = "yes"; } $feature_spec .= "$type_name name=LM$i factor=$f path=$fn order=$o\n"; |