diff options
author | Nicola Bertoldi <bertoldi@fbk.eu> | 2015-04-30 09:35:41 +0300
---|---|---
committer | Nicola Bertoldi <bertoldi@fbk.eu> | 2015-04-30 09:35:41 +0300
commit | 3400b622c0ff11ad690ef7933b062ec53789a13f (patch)
tree | f1bf07ba393a661b796813ffe0dabe812d4f7dd9 /scripts
parent | 5700fbaabf68339494c94affda856efcd1fd0818 (diff)
parent | eca582410006443d0b101a9ae188e302f34f8a03 (diff)
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/ems/experiment.meta | 7
-rwxr-xr-x | scripts/other/buckwalter.perl | 33
-rw-r--r-- | scripts/tokenizer/pre_tokenize_cleaning.py | 78
-rwxr-xr-x | scripts/training/filter-model-given-input.pl | 8
-rw-r--r-- | scripts/training/rdlm/README | 4
-rwxr-xr-x | scripts/training/rdlm/extract_syntactic_ngrams.py | 5
-rwxr-xr-x | scripts/training/rdlm/extract_vocab.py | 4
-rwxr-xr-x | scripts/training/rdlm/train_rdlm.py | 17
-rwxr-xr-x | scripts/training/train-model.perl | 2
-rwxr-xr-x | scripts/training/wrappers/madamira-wrapper.perl | 67
10 files changed, 199 insertions, 26 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index ead9ebe03..57ef4f9d6 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -165,11 +165,18 @@ get-corpus pass-unless: get-corpus-script default-name: lm/txt template: $get-corpus-script > OUT +use-parallel-corpus + in: parallel-corpus-stem + out: tokenized-corpus + default-name: lm/tok + ignore-unless: parallel-corpus-stem + template: ln -s IN.$output-extension OUT tokenize in: raw-corpus out: tokenized-corpus default-name: lm/tok pass-unless: output-tokenizer + ignore-if: parallel-corpus-stem template: $output-tokenizer < IN > OUT parallelizable: yes mock-parse diff --git a/scripts/other/buckwalter.perl b/scripts/other/buckwalter.perl new file mode 100755 index 000000000..62544e212 --- /dev/null +++ b/scripts/other/buckwalter.perl @@ -0,0 +1,33 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Encode::Arabic::Buckwalter; +use Getopt::Long "GetOptions"; + +my $direction; +GetOptions('direction=i' => \$direction) + or exit(1); +# direction: 1=arabic->bw, 2=bw->arabic + +die("ERROR: need to set direction") unless defined($direction); + + + +while (my $line = <STDIN>) { + chomp($line); + + my $lineOut; + if ($direction == 1) { + $lineOut = encode 'buckwalter', decode 'utf8', $line; + } + elsif ($direction == 2) { + $lineOut = encode 'utf8', decode 'buckwalter', $line; + } + else { + die("Unknown direction: $direction"); + } + print "$lineOut\n"; + +} + diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py new file mode 100644 index 000000000..76736da5c --- /dev/null +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python -*- coding: utf-8 -*- + +""" +The Gacha filter cleans out sentence pairs that have global character mean +lower than a certain threshold. + +Use this cleaner to produce low quantity of high quality sentence pairs. 
+ +It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during +WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER. +(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf) + +This is inspired by the global character mean that is used in the Gale-Church +algorithm (Gale aand Church, 1993), the c variable in: + + delta = (l2-l1*c)/math.sqrt(l1*s2) + +where: + - l1 = len(source_sentence) + - l2 = len(target_sentence) + - c = global mean, i.e. #char in source corpus / #char in target corpus + - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1) + +(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf) +""" + +import io, subprocess + +red = '\033[01;31m' +native = '\033[m' + +def err_msg(txt): + return red+txt+native + +def num_char(filename): + return float(subprocess.Popen(["wc", "-m", filename], + stdout=subprocess.PIPE).stdout.read().split()[0]) + +def gacha_mean(sourcefile, targetfile): + """ + Counts the global character mean between source and target language as + in Gale-Church (1993) + """ + sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n')) + c = num_char(sourcefile) / num_char(targetfile) + sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n')) + sys.stderr.write(err_msg('Filtering starts ...\n')) + return c + +def main(sourcefile, targetfile, threshold=0.2): + # Calculates Gacha mean. + c = gacha_mean(sourcefile, targetfile) + # Calculates lower and upperbound for filtering + threshold = float(threshold) + lowerbound = (1-threshold) * c + upperbound = (1+threshold) * c + + # Start filtering sentences. 
+ with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \ + io.open(targetfile, 'r', encoding='utf8') as trgfin: + for s, t in zip(srcfin, trgfin): + if lowerbound < len(s) / float(len(t)) < upperbound: + print(u"{}\t{}\n".format(s.strip(),t.strip())) + +if __name__ == '__main__': + import sys + if len(sys.argv) not in range(3,5): + usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n' + % sys.argv[0]) + + example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de ' + '~/Europarl.de-en.en 0.4\n' + % sys.argv[0]) + sys.stderr.write(usage_msg) + sys.stderr.write(example_msg) + sys.exit(1) + + main(*sys.argv[1:]) diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 7dec0762c..1464fdb73 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -37,6 +37,7 @@ my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; +my $threads = 1; # Default is single-thread, i.e. 
$threads=1 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; my $min_score = undef; my $opt_min_non_initial_rule_count = undef; @@ -54,6 +55,7 @@ GetOptions( "SyntaxFilterCmd=s" => \$syntax_filter_cmd, "tempdir=s" => \$tempdir, "MinScore=s" => \$min_score, + "threads=i" => \$threads, "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED ) or exit(1); @@ -63,7 +65,7 @@ my $config = shift; my $input = shift; if (!defined $dir || !defined $config || !defined $input) { - print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n"; + print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n"; exit 1; } $dir = ensure_full_path($dir); @@ -405,7 +407,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { # ... 
phrase translation model elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table - my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] && rm $mid_file.sorted"; + my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { my $cmd = "$binarizer $mid_file $new_file.bin"; @@ -426,7 +428,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { $lexbin =~ s/PhraseTable/LexicalTable/; my $cmd; if ($lexbin =~ /processLexicalTableMin/) { - $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file && rm $mid_file.sorted"; + $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; } else { $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options $cmd = "$lexbin -in $mid_file -out $new_file"; diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README index 209daf1c0..ca2a06015 100644 --- a/scripts/training/rdlm/README +++ b/scripts/training/rdlm/README @@ -31,8 +31,8 @@ RDLM is split into two neural network models, which can be trained with mkdir working_dir_head mkdir working_dir_label - ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise-samples 100 - ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise-samples 50 + ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_head --output-dir 
/path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise 100 + ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise 50 for more options, run `train_rdlm.py --help`. Parameters you may want to adjust include the vocabulary size of the label model (depending on the number of diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index eca1b3a49..f3ce41080 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -113,13 +113,14 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p int_list.extend(parent_heads) int_list.extend(parent_labels) + # write root of tree if options.mode == 'label': int_list.append(output_vocab.get(label, 0)) - sys.stdout.write(' '.join(map(str, int_list)) + '\n') + options.output.write(' '.join(map(str, int_list)) + '\n') elif options.mode == 'head' and not head == '<dummy_head>': int_list.append(vocab.get(label, 0)) int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0))) - sys.stdout.write(' '.join(map(str, int_list)) + '\n') + options.output.write(' '.join(map(str, int_list)) + '\n') parent_heads.append(vocab.get(head, 0)) parent_labels.append(vocab.get(label, 0)) diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py index 55ecbe554..6d017602e 100755 --- a/scripts/training/rdlm/extract_vocab.py +++ b/scripts/training/rdlm/extract_vocab.py @@ -59,10 +59,6 @@ def get_head(xml, args): preterminal = child.get('label') head = escape_text(child.text.strip()) - # hack for split compounds - elif child[-1].get('label') == 'SEGMENT': - return escape_text(child[-1].text.strip()), 'SEGMENT' - elif args.ptkvz and head and child.get('label') == 
'avz': for grandchild in child: if grandchild.get('label') == 'PTKVZ': diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index 1e7ecac52..15e56c430 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -43,7 +43,7 @@ parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)") parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)") parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)") -parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)") +parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)") parser.set_defaults( @@ -95,7 +95,7 @@ def prepare_vocabulary(options): filtered_vocab = open(orig).readlines() orig = vocab_prefix + '.nonterminals' filtered_vocab += open(orig).readlines() - filtered_vocab = [word for word in filtered_vocab if not word.startswith(prefix) for prefix in blacklist] + filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)] if options.output_vocab_size: filtered_vocab = filtered_vocab[:options.output_vocab_size] else: @@ -127,12 +127,13 @@ def main(options): sys.stderr.write('extracting syntactic n-grams\n') extract_syntactic_ngrams.main(extract_options) - if validation_corpus: - extract_options.input = options.validation_corpus - options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized') - extract_options.output = options.validation_file + if 
options.validation_corpus: + extract_options.input = open(options.validation_corpus) + options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus)) + extract_options.output = open(options.validation_file + '.numberized', 'w') sys.stderr.write('extracting syntactic n-grams (validation file)\n') extract_syntactic_ngrams.main(extract_options) + extract_options.output.close() sys.stderr.write('training neural network\n') train_nplm.main(options) @@ -141,8 +142,8 @@ def main(options): ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'), options.nplm_home, os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)), - os.path.join(options.working_dir, options.corpus_stem + '.numberized'), - os.path.join(options.output_dir, options.output_model + '.model.nplm.') + os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'), + os.path.join(options.output_dir, options.output_model + '.model.nplm') ]) if ret: raise Exception("averaging null words failed") diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index fb63d4bbd..4c355479c 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl index 6e7efe245..5c1d0404f 100755 --- a/scripts/training/wrappers/madamira-wrapper.perl +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -8,21 +8,38 @@ use File::Basename; use FindBin qw($RealBin); use Cwd 'abs_path'; +sub GetFactors; + + my $TMPDIR = "tmp"; my $SCHEME = "D2"; my $KEEP_TMP = 0; my $MADA_DIR; +my $CONFIG; + +my $FACTORS_STR; +my @FACTORS; GetOptions( "scheme=s" => \$SCHEME, "tmpdir=s" => \$TMPDIR, "keep-tmp" => \$KEEP_TMP, - "mada-dir=s" => \$MADA_DIR + "mada-dir=s" => \$MADA_DIR, + 
"factors=s" => \$FACTORS_STR, + "config=s" => \$CONFIG ) or die("ERROR: unknown options"); +if (!defined($CONFIG)) { + $CONFIG = "$MADA_DIR/samples/sampleConfigFile.xml"; +} + $TMPDIR = abs_path($TMPDIR); print STDERR "TMPDIR=$TMPDIR \n"; +if (defined($FACTORS_STR)) { + @FACTORS = split(",", $FACTORS_STR); +} + #binmode(STDIN, ":utf8"); #binmode(STDOUT, ":utf8"); @@ -54,7 +71,7 @@ else { $cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x"; `$cmd`; -$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*"; +$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*"; print STDERR "Executing: $cmd\n"; `$cmd`; @@ -66,7 +83,7 @@ print STDERR "Executing: $cmd\n"; open(MADA_OUT,"<$infile.mada"); #binmode(MADA_OUT, ":utf8"); while(my $line = <MADA_OUT>) { - chop($line); + chomp($line); #print STDERR "line=$line \n"; if (index($line, "SENTENCE BREAK") == 0) { @@ -75,13 +92,21 @@ while(my $line = <MADA_OUT>) { print "\n"; } elsif (index($line, ";;WORD") == 0) { - # word + # word my $word = substr($line, 7, length($line) - 8); - #print STDERR "FOund $word\n"; + #print STDERR "FOund $word\n"; + + for (my $i = 0; $i < 4; ++$i) { + $line = <MADA_OUT>; + } + + my $factors = GetFactors($line, \@FACTORS); + $word .= $factors; + print "$word "; } else { - #print STDERR "NADA\n"; + #print STDERR "NADA\n"; } } close (MADA_OUT); @@ -91,3 +116,33 @@ if ($KEEP_TMP == 0) { # `rm -rf $TMPDIR`; } + +########################### +sub GetFactors +{ + my $line = shift; + my $factorsRef = shift; + my @factors = @{$factorsRef}; + + # all factors + my %allFactors; + my @toks = split(" ", $line); + for (my $i = 1; $i < scalar(@toks); ++$i) { + #print " tok=" .$toks[$i]; + + my ($key, $value) = 
split(":", $toks[$i]); + $allFactors{$key} = $value; + } + + my $ret = ""; + my $factorType; + foreach $factorType(@factors) { + #print "factorType=$factorType "; + my $value = $allFactors{$factorType}; + + $ret .= "|$value"; + } + + return $ret; +} + |