diff options
author | Nicola Bertoldi <bertoldi@fbk.eu> | 2015-04-30 09:35:41 +0300
---|---|---
committer | Nicola Bertoldi <bertoldi@fbk.eu> | 2015-04-30 09:35:41 +0300
commit | 3400b622c0ff11ad690ef7933b062ec53789a13f (patch)
tree | f1bf07ba393a661b796813ffe0dabe812d4f7dd9 /scripts
parent | 5700fbaabf68339494c94affda856efcd1fd0818 (diff)
parent | eca582410006443d0b101a9ae188e302f34f8a03 (diff)
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/ems/experiment.meta | 7
-rwxr-xr-x | scripts/other/buckwalter.perl | 33
-rw-r--r-- | scripts/tokenizer/pre_tokenize_cleaning.py | 78
-rwxr-xr-x | scripts/training/filter-model-given-input.pl | 8
-rw-r--r-- | scripts/training/rdlm/README | 4
-rwxr-xr-x | scripts/training/rdlm/extract_syntactic_ngrams.py | 5
-rwxr-xr-x | scripts/training/rdlm/extract_vocab.py | 4
-rwxr-xr-x | scripts/training/rdlm/train_rdlm.py | 17
-rwxr-xr-x | scripts/training/train-model.perl | 2
-rwxr-xr-x | scripts/training/wrappers/madamira-wrapper.perl | 67
10 files changed, 199 insertions, 26 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index ead9ebe03..57ef4f9d6 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -165,11 +165,18 @@ get-corpus pass-unless: get-corpus-script default-name: lm/txt template: $get-corpus-script > OUT +use-parallel-corpus + in: parallel-corpus-stem + out: tokenized-corpus + default-name: lm/tok + ignore-unless: parallel-corpus-stem + template: ln -s IN.$output-extension OUT tokenize in: raw-corpus out: tokenized-corpus default-name: lm/tok pass-unless: output-tokenizer + ignore-if: parallel-corpus-stem template: $output-tokenizer < IN > OUT parallelizable: yes mock-parse diff --git a/scripts/other/buckwalter.perl b/scripts/other/buckwalter.perl new file mode 100755 index 000000000..62544e212 --- /dev/null +++ b/scripts/other/buckwalter.perl @@ -0,0 +1,33 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Encode::Arabic::Buckwalter; +use Getopt::Long "GetOptions"; + +my $direction; +GetOptions('direction=i' => \$direction) + or exit(1); +# direction: 1=arabic->bw, 2=bw->arabic + +die("ERROR: need to set direction") unless defined($direction); + + + +while (my $line = <STDIN>) { + chomp($line); + + my $lineOut; + if ($direction == 1) { + $lineOut = encode 'buckwalter', decode 'utf8', $line; + } + elsif ($direction == 2) { + $lineOut = encode 'utf8', decode 'buckwalter', $line; + } + else { + die("Unknown direction: $direction"); + } + print "$lineOut\n"; + +} + diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py new file mode 100644 index 000000000..76736da5c --- /dev/null +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python -*- coding: utf-8 -*- + +""" +The Gacha filter cleans out sentence pairs that have global character mean +lower than a certain threshold. + +Use this cleaner to produce low quantity of high quality sentence pairs. 
+ +It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during +WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER. +(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf) + +This is inspired by the global character mean that is used in the Gale-Church +algorithm (Gale aand Church, 1993), the c variable in: + + delta = (l2-l1*c)/math.sqrt(l1*s2) + +where: + - l1 = len(source_sentence) + - l2 = len(target_sentence) + - c = global mean, i.e. #char in source corpus / #char in target corpus + - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1) + +(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf) +""" + +import io, subprocess + +red = '\033[01;31m' +native = '\033[m' + +def err_msg(txt): + return red+txt+native + +def num_char(filename): + return float(subprocess.Popen(["wc", "-m", filename], + stdout=subprocess.PIPE).stdout.read().split()[0]) + +def gacha_mean(sourcefile, targetfile): + """ + Counts the global character mean between source and target language as + in Gale-Church (1993) + """ + sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n')) + c = num_char(sourcefile) / num_char(targetfile) + sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n')) + sys.stderr.write(err_msg('Filtering starts ...\n')) + return c + +def main(sourcefile, targetfile, threshold=0.2): + # Calculates Gacha mean. + c = gacha_mean(sourcefile, targetfile) + # Calculates lower and upperbound for filtering + threshold = float(threshold) + lowerbound = (1-threshold) * c + upperbound = (1+threshold) * c + + # Start filtering sentences. 
+ with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \ + io.open(targetfile, 'r', encoding='utf8') as trgfin: + for s, t in zip(srcfin, trgfin): + if lowerbound < len(s) / float(len(t)) < upperbound: + print(u"{}\t{}\n".format(s.strip(),t.strip())) + +if __name__ == '__main__': + import sys + if len(sys.argv) not in range(3,5): + usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n' + % sys.argv[0]) + + example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de ' + '~/Europarl.de-en.en 0.4\n' + % sys.argv[0]) + sys.stderr.write(usage_msg) + sys.stderr.write(example_msg) + sys.exit(1) + + main(*sys.argv[1:]) diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 7dec0762c..1464fdb73 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -37,6 +37,7 @@ my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; +my $threads = 1; # Default is single-thread, i.e. 
$threads=1 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; my $min_score = undef; my $opt_min_non_initial_rule_count = undef; @@ -54,6 +55,7 @@ GetOptions( "SyntaxFilterCmd=s" => \$syntax_filter_cmd, "tempdir=s" => \$tempdir, "MinScore=s" => \$min_score, + "threads=i" => \$threads, "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED ) or exit(1); @@ -63,7 +65,7 @@ my $config = shift; my $input = shift; if (!defined $dir || !defined $config || !defined $input) { - print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n"; + print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n"; exit 1; } $dir = ensure_full_path($dir); @@ -405,7 +407,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { # ... 
phrase translation model elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table - my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] && rm $mid_file.sorted"; + my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { my $cmd = "$binarizer $mid_file $new_file.bin"; @@ -426,7 +428,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { $lexbin =~ s/PhraseTable/LexicalTable/; my $cmd; if ($lexbin =~ /processLexicalTableMin/) { - $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file && rm $mid_file.sorted"; + $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; } else { $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options $cmd = "$lexbin -in $mid_file -out $new_file"; diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README index 209daf1c0..ca2a06015 100644 --- a/scripts/training/rdlm/README +++ b/scripts/training/rdlm/README @@ -31,8 +31,8 @@ RDLM is split into two neural network models, which can be trained with mkdir working_dir_head mkdir working_dir_label - ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise-samples 100 - ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise-samples 50 + ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_head --output-dir 
/path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise 100 + ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise 50 for more options, run `train_rdlm.py --help`. Parameters you may want to adjust include the vocabulary size of the label model (depending on the number of diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index eca1b3a49..f3ce41080 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -113,13 +113,14 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p int_list.extend(parent_heads) int_list.extend(parent_labels) + # write root of tree if options.mode == 'label': int_list.append(output_vocab.get(label, 0)) - sys.stdout.write(' '.join(map(str, int_list)) + '\n') + options.output.write(' '.join(map(str, int_list)) + '\n') elif options.mode == 'head' and not head == '<dummy_head>': int_list.append(vocab.get(label, 0)) int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0))) - sys.stdout.write(' '.join(map(str, int_list)) + '\n') + options.output.write(' '.join(map(str, int_list)) + '\n') parent_heads.append(vocab.get(head, 0)) parent_labels.append(vocab.get(label, 0)) diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py index 55ecbe554..6d017602e 100755 --- a/scripts/training/rdlm/extract_vocab.py +++ b/scripts/training/rdlm/extract_vocab.py @@ -59,10 +59,6 @@ def get_head(xml, args): preterminal = child.get('label') head = escape_text(child.text.strip()) - # hack for split compounds - elif child[-1].get('label') == 'SEGMENT': - return escape_text(child[-1].text.strip()), 'SEGMENT' - elif args.ptkvz and head and child.get('label') == 
'avz': for grandchild in child: if grandchild.get('label') == 'PTKVZ': diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index 1e7ecac52..15e56c430 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -43,7 +43,7 @@ parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)") parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)") parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)") -parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)") +parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)") parser.set_defaults( @@ -95,7 +95,7 @@ def prepare_vocabulary(options): filtered_vocab = open(orig).readlines() orig = vocab_prefix + '.nonterminals' filtered_vocab += open(orig).readlines() - filtered_vocab = [word for word in filtered_vocab if not word.startswith(prefix) for prefix in blacklist] + filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)] if options.output_vocab_size: filtered_vocab = filtered_vocab[:options.output_vocab_size] else: @@ -127,12 +127,13 @@ def main(options): sys.stderr.write('extracting syntactic n-grams\n') extract_syntactic_ngrams.main(extract_options) - if validation_corpus: - extract_options.input = options.validation_corpus - options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized') - extract_options.output = options.validation_file + if 
options.validation_corpus: + extract_options.input = open(options.validation_corpus) + options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus)) + extract_options.output = open(options.validation_file + '.numberized', 'w') sys.stderr.write('extracting syntactic n-grams (validation file)\n') extract_syntactic_ngrams.main(extract_options) + extract_options.output.close() sys.stderr.write('training neural network\n') train_nplm.main(options) @@ -141,8 +142,8 @@ def main(options): ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'), options.nplm_home, os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)), - os.path.join(options.working_dir, options.corpus_stem + '.numberized'), - os.path.join(options.output_dir, options.output_model + '.model.nplm.') + os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'), + os.path.join(options.output_dir, options.output_model + '.model.nplm') ]) if ret: raise Exception("averaging null words failed") diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index fb63d4bbd..4c355479c 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl index 6e7efe245..5c1d0404f 100755 --- a/scripts/training/wrappers/madamira-wrapper.perl +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -8,21 +8,38 @@ use File::Basename; use FindBin qw($RealBin); use Cwd 'abs_path'; +sub GetFactors; + + my $TMPDIR = "tmp"; my $SCHEME = "D2"; my $KEEP_TMP = 0; my $MADA_DIR; +my $CONFIG; + +my $FACTORS_STR; +my @FACTORS; GetOptions( "scheme=s" => \$SCHEME, "tmpdir=s" => \$TMPDIR, "keep-tmp" => \$KEEP_TMP, - "mada-dir=s" => \$MADA_DIR + "mada-dir=s" => \$MADA_DIR, + 
"factors=s" => \$FACTORS_STR, + "config=s" => \$CONFIG ) or die("ERROR: unknown options"); +if (!defined($CONFIG)) { + $CONFIG = "$MADA_DIR/samples/sampleConfigFile.xml"; +} + $TMPDIR = abs_path($TMPDIR); print STDERR "TMPDIR=$TMPDIR \n"; +if (defined($FACTORS_STR)) { + @FACTORS = split(",", $FACTORS_STR); +} + #binmode(STDIN, ":utf8"); #binmode(STDOUT, ":utf8"); @@ -54,7 +71,7 @@ else { $cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x"; `$cmd`; -$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*"; +$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*"; print STDERR "Executing: $cmd\n"; `$cmd`; @@ -66,7 +83,7 @@ print STDERR "Executing: $cmd\n"; open(MADA_OUT,"<$infile.mada"); #binmode(MADA_OUT, ":utf8"); while(my $line = <MADA_OUT>) { - chop($line); + chomp($line); #print STDERR "line=$line \n"; if (index($line, "SENTENCE BREAK") == 0) { @@ -75,13 +92,21 @@ while(my $line = <MADA_OUT>) { print "\n"; } elsif (index($line, ";;WORD") == 0) { - # word + # word my $word = substr($line, 7, length($line) - 8); - #print STDERR "FOund $word\n"; + #print STDERR "FOund $word\n"; + + for (my $i = 0; $i < 4; ++$i) { + $line = <MADA_OUT>; + } + + my $factors = GetFactors($line, \@FACTORS); + $word .= $factors; + print "$word "; } else { - #print STDERR "NADA\n"; + #print STDERR "NADA\n"; } } close (MADA_OUT); @@ -91,3 +116,33 @@ if ($KEEP_TMP == 0) { # `rm -rf $TMPDIR`; } + +########################### +sub GetFactors +{ + my $line = shift; + my $factorsRef = shift; + my @factors = @{$factorsRef}; + + # all factors + my %allFactors; + my @toks = split(" ", $line); + for (my $i = 1; $i < scalar(@toks); ++$i) { + #print " tok=" .$toks[$i]; + + my ($key, $value) = 
split(":", $toks[$i]); + $allFactors{$key} = $value; + } + + my $ret = ""; + my $factorType; + foreach $factorType(@factors) { + #print "factorType=$factorType "; + my $value = $allFactors{$factorType}; + + $ret .= "|$value"; + } + + return $ret; +} + |