From 5536e13213fe518039e08fae30d1a160e457717f Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Fri, 20 Mar 2015 18:44:59 +0100
Subject: added Gacha Filter from WMT14

---
 scripts/tokenizer/pre_tokenize_cleaning.py | 78 ++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 scripts/tokenizer/pre_tokenize_cleaning.py

(limited to 'scripts')

diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py
new file mode 100644
index 000000000..76736da5c
--- /dev/null
+++ b/scripts/tokenizer/pre_tokenize_cleaning.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+The Gacha filter cleans out sentence pairs whose source/target character
+ratio deviates from the global character mean by more than a threshold.
+
+Use this cleaner to produce a small quantity of high-quality sentence pairs.
+
+It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
+WMT14 when threshold is set at 20% (Tan and Pal, 2014), achieving lowest TER.
+(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
+
+This is inspired by the global character mean that is used in the Gale-Church
+algorithm (Gale and Church, 1993), the c variable in:
+
+    delta = (l2-l1*c)/math.sqrt(l1*s2)
+
+where:
+ - l1 = len(source_sentence)
+ - l2 = len(target_sentence)
+ - c = global mean, i.e. #char in source corpus / #char in target corpus
+ - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1)
+
+(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
+"""
+
+import io, subprocess, sys
+
+red = '\033[01;31m'  # ANSI escape sequence: bold red foreground
+native = '\033[m'  # ANSI escape sequence: reset attributes
+
+def err_msg(txt):
+    return ''.join((red, txt, native))  # txt wrapped in the codes above
+
+def num_char(filename):
+    """Return the character count of `filename` (from `wc -m`) as a float."""
+    return float(subprocess.check_output(["wc", "-m", filename]).split()[0])
+
+def gacha_mean(sourcefile, targetfile):
+    """Return the Gale-Church (1993) global character mean: the character
+    count of sourcefile divided by that of targetfile; progress goes to stderr.
+    """
+    log = sys.stderr.write
+    log(err_msg('Calculating Gacha mean, please wait ...\n'))
+    ratio = num_char(sourcefile) / num_char(targetfile)
+    log(err_msg('Gacha mean = '+str(ratio)+'\n'))
+    log(err_msg('Filtering starts ...\n'))
+    return ratio
+
+def main(sourcefile, targetfile, threshold=0.2):
+    """Write each pair with len(s)/len(t) strictly inside (1 +/- threshold)*c."""
+    c = gacha_mean(sourcefile, targetfile)
+    threshold = float(threshold)  # a string when it comes from sys.argv
+    lowerbound = (1-threshold) * c
+    upperbound = (1+threshold) * c
+    # Encode output explicitly as UTF-8: Python 2's print() falls back to ASCII
+    # when stdout is a pipe or file and dies on the first non-ASCII sentence.
+    with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
+    io.open(targetfile, 'r', encoding='utf8') as trgfin, \
+    io.open(sys.stdout.fileno(), 'w', encoding='utf8', closefd=False) as out:
+        for s, t in zip(srcfin, trgfin):
+            if lowerbound < len(s) / float(len(t)) < upperbound:
+                out.write(u"{}\t{}\n\n".format(s.strip(),t.strip()))
+
+if __name__ == '__main__':
+    import sys
+    if len(sys.argv) not in range(3,5):
+        usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
+                            % sys.argv[0])
+        # Both templates need a %s, else the % below raises TypeError.
+        example_msg = err_msg('Example: python %s ~/Europarl.de-en.de '
+                            '~/Europarl.de-en.en 0.4\n'
+                            % sys.argv[0])
+        sys.stderr.write(usage_msg)
+        sys.stderr.write(example_msg)
+        sys.exit(1)
+
+    main(*sys.argv[1:])
-- 
cgit v1.2.3


From 40933b4a782d43b9e07a89e62c4c6ffd1b12d1c5 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Wed, 22 Apr 2015 19:01:12 +0400
Subject: hack to allow target side of tokenized parallel corpus to be used for
 LM

---
 scripts/ems/experiment.meta | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'scripts')

diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index ead9ebe03..62f85eb1c 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -165,11 +165,18 @@ get-corpus
 	pass-unless: get-corpus-script
 	default-name: lm/txt
 	template: $get-corpus-script > OUT
+use-parallel-corpus
+  in: parallel-corpus-stem
+  out: tokenized-corpus
+	default-name: lm/tok
+	pass-unless: parallel-corpus-stem
+	template: ln -s IN.$output-extension  OUT	
 tokenize
 	in: raw-corpus
 	out: tokenized-corpus
 	default-name: lm/tok
 	pass-unless: output-tokenizer
+	ignore-if: parallel-corpus-stem
 	template: $output-tokenizer < IN > OUT
 	parallelizable: yes
 mock-parse
-- 
cgit v1.2.3


From 4b47e1148c7cfe771c8e813cb9d741c2de44ed42 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Wed, 22 Apr 2015 23:02:57 +0400
Subject: use ignore-unless /Philipp Koehn

---
 scripts/ems/experiment.meta       | 2 +-
 scripts/training/train-model.perl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'scripts')

diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 62f85eb1c..57ef4f9d6 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -169,7 +169,7 @@ use-parallel-corpus
   in: parallel-corpus-stem
   out: tokenized-corpus
 	default-name: lm/tok
-	pass-unless: parallel-corpus-stem
+	ignore-unless: parallel-corpus-stem
 	template: ln -s IN.$output-extension  OUT	
 tokenize
 	in: raw-corpus
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index fb63d4bbd..4c355479c 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl 
+#!/usr/bin/env perl
 
 use warnings;
 use strict;
-- 
cgit v1.2.3


From 585784f62aff40b1c81256a55db8fd97408de31c Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Fri, 24 Apr 2015 18:57:28 +0200
Subject: added thread options for filter-model-given-input.pl

---
 scripts/training/filter-model-given-input.pl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
 mode change 100755 => 100644 scripts/training/filter-model-given-input.pl

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
old mode 100755
new mode 100644
index 3ce426c39..d238d7dcb
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -36,6 +36,7 @@ my $ZCAT = "gzip -cd";
 # get optional parameters
 my $opt_hierarchical = 0;
 my $binarizer = undef;
+my $threads = 1; # Default is single-thread
 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical";
 my $min_score = undef;
 my $opt_min_non_initial_rule_count = undef;
@@ -53,6 +54,7 @@ GetOptions(
     "SyntaxFilterCmd=s" => \$syntax_filter_cmd,
     "tempdir=s" => \$tempdir,
     "MinScore=s" => \$min_score,
+    "threads=i" => \$threads,  # integer count; a bare "threads" spec is a valueless flag
     "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count,  # DEPRECATED
 ) or exit(1);
 
@@ -404,8 +406,8 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         # ... phrase translation model
         elsif ($binarizer =~ /processPhraseTableMin/) {
           #compact phrase table
-          ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] && rm $mid_file.sorted";
-          my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -encoding None";
+          ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
+          my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
           safesystem($cmd) or die "Can't binarize";
         } elsif ($binarizer =~ /CreateOnDiskPt/) {
       	  my $cmd = "$binarizer $mid_file $new_file.bin";
@@ -426,7 +428,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         $lexbin =~ s/PhraseTable/LexicalTable/;
         my $cmd;
         if ($lexbin =~ /processLexicalTableMin/) {
-          $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file && rm $mid_file.sorted";
+          $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
         } else {
           $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
           $cmd = "$lexbin -in $mid_file -out $new_file";
-- 
cgit v1.2.3


From 6c63ca963c4d6f9455445df59ba51593888ef705 Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Fri, 24 Apr 2015 19:06:37 +0200
Subject: checks for undefined $threads

---
 scripts/training/filter-model-given-input.pl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index d238d7dcb..2ce1f26a2 100644
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -36,7 +36,7 @@ my $ZCAT = "gzip -cd";
 # get optional parameters
 my $opt_hierarchical = 0;
 my $binarizer = undef;
-my $threads = 1; # Default is single-thread
+my $threads = undef; # Default is single-thread
 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical";
 my $min_score = undef;
 my $opt_min_non_initial_rule_count = undef;
@@ -407,6 +407,9 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         elsif ($binarizer =~ /processPhraseTableMin/) {
           #compact phrase table
           ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
+          if(!defined($threads)) {
+          	$thread = 1
+          }
           my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
           safesystem($cmd) or die "Can't binarize";
         } elsif ($binarizer =~ /CreateOnDiskPt/) {
@@ -428,6 +431,9 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         $lexbin =~ s/PhraseTable/LexicalTable/;
         my $cmd;
         if ($lexbin =~ /processLexicalTableMin/) {
+          if(!defined($threads)) {
+          	$thread = 1
+          }
           $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
         } else {
           $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
-- 
cgit v1.2.3


From aa9207acfc1e8746d3bbb1626472f8906c7c7fa9 Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Fri, 24 Apr 2015 19:10:10 +0200
Subject: fixed typo in $thread to $threads

---
 scripts/training/filter-model-given-input.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index 2ce1f26a2..d015d3762 100644
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -408,7 +408,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
           #compact phrase table
           ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
           if(!defined($threads)) {
-          	$thread = 1
+          	$threads = 1
           }
           my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
           safesystem($cmd) or die "Can't binarize";
@@ -432,7 +432,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         my $cmd;
         if ($lexbin =~ /processLexicalTableMin/) {
           if(!defined($threads)) {
-          	$thread = 1
+          	$threads = 1
           }
           $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
         } else {
-- 
cgit v1.2.3


From 0ccbcaece6133e68782154c9b23c50f2393b10df Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Fri, 24 Apr 2015 19:11:57 +0200
Subject: added $threads option in usage example

---
 scripts/training/filter-model-given-input-new.pl | 537 +++++++++++++++++++++++
 scripts/training/filter-model-given-input.pl     |   4 +-
 2 files changed, 539 insertions(+), 2 deletions(-)
 create mode 100644 scripts/training/filter-model-given-input-new.pl

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input-new.pl b/scripts/training/filter-model-given-input-new.pl
new file mode 100644
index 000000000..029d83ed0
--- /dev/null
+++ b/scripts/training/filter-model-given-input-new.pl
@@ -0,0 +1,537 @@
+#!/usr/bin/perl -w
+
+# $Id$
+# Given a moses.ini file and an input text prepare minimized translation
+# tables and a new moses.ini, so that loading of tables is much faster.
+
+# original code by Philipp Koehn
+# changes by Ondrej Bojar
+# adapted for hierarchical models by Phil Williams
+
+use strict;
+
+use FindBin qw($RealBin);
+use Getopt::Long;
+
+my $SCRIPTS_ROOTDIR;
+if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
+    $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
+} else {
+    $SCRIPTS_ROOTDIR = $RealBin;
+    if ($SCRIPTS_ROOTDIR eq '') {
+        $SCRIPTS_ROOTDIR = dirname(__FILE__);
+    }
+    $SCRIPTS_ROOTDIR =~ s/\/training$//;
+    $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
+}
+
+# consider phrases in input up to $MAX_LENGTH
+# in other words, all phrase-tables will be truncated at least to 10 words per
+# phrase.
+my $MAX_LENGTH = 10;
+
+# utilities
+my $ZCAT = "gzip -cd";
+
+# get optional parameters
+my $opt_hierarchical = 0;
+my $binarizer = undef;
+my $threads = undef; # Default is single-thread
+my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical";
+my $min_score = undef;
+my $opt_min_non_initial_rule_count = undef;
+my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
+my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice
+my $opt_strip_xml = 1; # disabling XML stripping is required for STSG models where the input is a tree or forest
+my $tempdir = undef;
+
+GetOptions(
+    "gzip!" => \$opt_gzip,
+    "filter!" => \$opt_filter,
+    "Hierarchical" => \$opt_hierarchical,
+    "Binarizer=s" => \$binarizer,
+    "StripXml!" => \$opt_strip_xml,
+    "SyntaxFilterCmd=s" => \$syntax_filter_cmd,
+    "tempdir=s" => \$tempdir,
+    "MinScore=s" => \$min_score,
+    "threads" => \$threads,
+    "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count,  # DEPRECATED
+) or exit(1);
+
+# get command line parameters
+my $dir = shift;
+my $config = shift;
+my $input = shift;
+
+if (!defined $dir || !defined $config || !defined $input) {
+  print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n";
+  exit 1;
+}
+$dir = ensure_full_path($dir);
+
+# Warn if deprecated -MinNonInitialRuleCount option is used
+if (defined($opt_min_non_initial_rule_count)) {
+  print STDERR "WARNING: -MinNonInitialRuleCount is deprecated; use score's -MinCountHierarchical option or set -SyntaxFilterCmd to \"$SCRIPTS_ROOTDIR/training/filter-rule-table.py --min-non-initial-rule=$opt_min_non_initial_rule_count\"\n";
+}
+
+$tempdir = $dir if !defined $tempdir; # use the working directory as temp by def.
+
+# decode min-score definitions
+my %MIN_SCORE;
+if ($min_score) {
+  foreach (split(/ *, */,$min_score)) {
+    my ($id,$score) = split(/ *: */);
+    $MIN_SCORE{$id} = $score;
+    print STDERR "score $id must be at least $score\n";
+  }
+}
+# buggy directory in place?
+if (-d $dir && ! -e "$dir/info") {
+    print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n";
+    exit(1);
+}
+
+# already filtered? check if it can be re-used
+if (-d $dir) {
+    my @INFO = `cat $dir/info`;
+    chop(@INFO);
+    if($INFO[0] ne $config 
+       || ($INFO[1] ne $input && 
+	   $INFO[1].".tagged" ne $input)) {
+      print STDERR "WARNING: directory exists but does not match parameters:\n";
+      print STDERR "  ($INFO[0] ne $config || $INFO[1] ne $input)\n";
+      exit 1;
+    }
+    print STDERR "The filtered model was ready in $dir, not doing anything.\n";
+    exit 0;
+}
+
+# filter the translation and distortion tables
+safesystem("mkdir -p $dir") or die "Can't mkdir $dir";
+
+my $cmd;
+if ($opt_strip_xml) {
+    my $inputStrippedXML = "$dir/input.$$";
+    $cmd = "$RealBin/../generic/strip-xml.perl < $input > $inputStrippedXML";
+    print STDERR "Stripping XML...\n";
+    safesystem($cmd) or die "Can't strip XML";
+    $input = $inputStrippedXML;
+}
+
+# get tables to be filtered (and modify config file)
+my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER);
+
+my %new_name_used = ();
+open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini";
+open(INI,$config) or die "Can't read $config";
+while(my $line = <INI>) {
+  chomp($line);
+  my @toks = split(/ /, $line);
+  if ($line =~ /PhraseDictionaryMemory /
+     || $line =~ /PhraseDictionaryBinary /
+     || $line =~ /PhraseDictionaryOnDisk /
+     || $line =~ /PhraseDictionarySCFG /
+     || $line =~ /RuleTable /
+     ) {
+    print STDERR "pt:$line\n";
+
+		my ($phrase_table_impl,$source_factor,$t,$w,$file,$table_flag,$skip); # = ($1,$2,$3,$4,$5,$6,$7);
+    $table_flag = "";
+    $phrase_table_impl = $toks[0];
+    $skip = 0;
+    
+    for (my $i = 1; $i < scalar(@toks); ++$i) {
+      my @args = split(/=/, $toks[$i]);
+      chomp($args[0]);
+      chomp($args[1]);
+
+      if ($args[0] eq "num-features") {
+			  $w = $args[1];
+			}
+			elsif ($args[0] eq "input-factor") {
+			  $source_factor = $args[1];
+			}
+			elsif ($args[0] eq "output-factor") {
+			  $t = $args[1];
+			}
+			elsif ($args[0] eq "path") {
+			  $file = $args[1];
+			}
+			elsif ($args[0] eq "filterable" && $args[1] eq "false") {
+			  $skip = 1;
+			}
+    } #for (my $i = 1; $i < scalar(@toks); ++$i) {
+    
+		if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG" && $phrase_table_impl ne "RuleTable") || $file =~ /glue-grammar/ || $skip) {
+				# Only Memory ("0") and NewFormat ("6") can be filtered.
+				print INI_OUT "$line\n";
+				next;
+		}
+
+		push @TABLE, $file;
+		push @TABLE_WEIGHTS,$w;
+		$KNOWN_TTABLE{$#TABLE}++;
+
+  	my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"});
+		my $cnt = 1;
+		$cnt ++ while (defined $new_name_used{"$new_name.$cnt"});
+		$new_name .= ".$cnt";
+		$new_name_used{$new_name} = 1;
+		if ($binarizer && $phrase_table_impl eq "PhraseDictionarySCFG") {
+		  $phrase_table_impl = "PhraseDictionaryOnDisk";
+		  @toks = set_value(\@toks, "path", "$new_name.bin$table_flag");
+		}
+		elsif ($binarizer && $phrase_table_impl eq "PhraseDictionaryMemory") {
+			if ($binarizer =~ /processPhraseTableMin/) {
+  		  $phrase_table_impl = "PhraseDictionaryCompact";
+  		  @toks = set_value(\@toks, "path", "$new_name$table_flag");
+			}
+			elsif ($binarizer =~ /CreateOnDiskPt/) {
+  		  $phrase_table_impl = "PhraseDictionaryOnDisk";
+		  @toks = set_value(\@toks, "path", "$new_name.bin$table_flag");
+			}
+			else {
+  		  $phrase_table_impl = "PhraseDictionaryBinary";
+			  @toks = set_value(\@toks, "path", "$new_name$table_flag");
+			}
+		}
+		else {
+			$new_name .= ".gz" if $opt_gzip;
+		  @toks = set_value(\@toks, "path", "$new_name$table_flag");
+		}
+
+    $toks[0] = $phrase_table_impl;
+
+    print INI_OUT join_array(\@toks)."\n";
+
+		push @TABLE_NEW_NAME,$new_name;
+
+		$CONSIDER_FACTORS{$source_factor} = 1;
+			print STDERR "Considering factor $source_factor\n";
+		push @TABLE_FACTORS, $source_factor;
+		
+  } #if (/PhraseModel /) {
+  elsif ($line =~ /LexicalReordering /) {
+    print STDERR "ro:$line\n";
+		my ($source_factor, $t, $w, $file); # = ($1,$2,$3,$4);
+
+    for (my $i = 1; $i < scalar(@toks); ++$i) {
+      my @args = split(/=/, $toks[$i]);
+      chomp($args[0]);
+      chomp($args[1]);
+      
+			if ($args[0] eq "num-features") {
+			  $w = $args[1];
+			}
+			elsif ($args[0] eq "input-factor") {
+			  $source_factor = chomp($args[1]);
+			}
+			elsif ($args[0] eq "output-factor") {
+			  #$t = chomp($args[1]);
+			}
+			elsif ($args[0] eq "type") {
+			  $t = $args[1];
+			}
+			elsif ($args[0] eq "path") {
+			  $file = $args[1];
+			}
+
+		} # for (my $i = 1; $i < scalar(@toks); ++$i) {
+		
+  	push @TABLE, $file;
+	push @TABLE_WEIGHTS,$w;
+		
+		$file =~ s/^.*\/+([^\/]+)/$1/g;
+		my $new_name = "$dir/$file";
+		$new_name =~ s/\.gz//;
+		
+		#print INI_OUT "$source_factor $t $w $new_name\n";
+	  @toks = set_value(\@toks, "path", "$new_name");
+	  print INI_OUT join_array(\@toks)."\n";
+
+		push @TABLE_NEW_NAME,$new_name;
+
+		$CONSIDER_FACTORS{$source_factor} = 1;
+			print STDERR "Considering factor $source_factor\n";
+		push @TABLE_FACTORS,$source_factor;
+
+		
+  } #elsif (/LexicalReordering /) {
+  else {
+    print INI_OUT "$line\n";  
+  }
+} # while(<INI>) {
+close(INI);
+close(INI_OUT);
+
+my %TMP_INPUT_FILENAME;
+
+if ($opt_hierarchical) {
+  if (!$opt_strip_xml) {
+    print STDERR "WARNING: source factor reduction is disabled due to use of -noStripXML option\n";
+  } else {
+    # Write a separate, temporary input file for each combination of source
+    # factors
+    foreach my $key (keys %CONSIDER_FACTORS) {
+      my $filename = "$dir/input-$key";
+      open(FILEHANDLE,">$filename") or die "Can't open $filename for writing";
+      $TMP_INPUT_FILENAME{$key} = $filename;
+      my @FACTOR = split(/,/, $key);
+      my $cmd = "$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |";
+      print STDERR "Executing: $cmd\n";
+      open(PIPE,$cmd);
+      while (my $line = <PIPE>) {
+        print FILEHANDLE $line
+      }
+      close(FILEHANDLE);
+    }
+  }
+}
+
+my %PHRASE_USED;
+if ($opt_filter && !$opt_hierarchical) {
+    # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
+    open(INPUT,mk_open_string($input)) or die "Can't read $input";
+    while(my $line = <INPUT>) {
+        chomp($line);
+        my @WORD = split(/ +/,$line);
+        for(my $i=0;$i<=$#WORD;$i++) {
+            for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
+                foreach (keys %CONSIDER_FACTORS) {
+                    my @FACTOR = split(/,/);
+                    my $phrase = "";
+                    for(my $k=$i;$k<=$i+$j;$k++) {
+                        my @WORD_FACTOR = split(/\|/,$WORD[$k]);
+                        for(my $f=0;$f<=$#FACTOR;$f++) {
+                            $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
+                        }
+                        chop($phrase);
+                        $phrase .= " ";
+                    }
+                    chop($phrase);
+                    $PHRASE_USED{$_}{$phrase}++;
+                }
+            }
+        }
+    }
+    close(INPUT);
+}
+
+# filter files
+print STDERR "Filtering files...\n";
+for(my $i=0;$i<=$#TABLE;$i++) {
+    my ($used,$total) = (0,0);
+    my $file = $TABLE[$i];
+    my $factors = $TABLE_FACTORS[$i];
+    my $new_file = $TABLE_NEW_NAME[$i];
+    print STDERR "filtering $file -> $new_file...\n";
+    my $mid_file = $new_file; # used when both filtering and binarizing
+    if (!$opt_filter) {
+      # check if original file was gzipped
+      if ($file !~ /\.gz$/ && -e "$file.gz") {
+        $file .= ".gz";
+      }
+      $mid_file .= ".gz" if $file =~ /\.gz$/;
+      $cmd = "ln -s $file $mid_file";
+      safesystem($cmd) or die "Failed to make symlink";
+    } else {
+
+      $mid_file .= ".gz"
+        if $mid_file !~ /\.gz/
+           && $binarizer && $binarizer =~ /processPhraseTable/;
+
+      my $openstring = mk_open_string($file);
+
+      my $mid_openstring;
+      if ($mid_file =~ /\.gz$/) {
+        $mid_openstring = "| gzip -c > $mid_file";
+      } else {
+        $mid_openstring = ">$mid_file";
+      }
+
+
+      open(FILE_OUT,$mid_openstring) or die "Can't write to $mid_openstring";
+
+      if ($opt_hierarchical) {
+          my $input_file = $opt_strip_xml ? $TMP_INPUT_FILENAME{$factors} : $input;
+          $cmd = "$openstring $syntax_filter_cmd $input_file |";
+          print STDERR "Executing: $cmd\n";
+          open(PIPE,$cmd);
+          while (my $line = <PIPE>) {
+              print FILE_OUT $line
+          }
+          close(FILEHANDLE);
+      } else {
+          open(FILE,$openstring) or die "Can't open '$openstring'";
+          while(my $entry = <FILE>) {
+              my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
+              $foreign =~ s/ $//;
+              if (defined($PHRASE_USED{$factors}{$foreign})) {
+                  # handle min_score thresholds
+                  if ($min_score) {
+                     my @ITEM = split(/ *\|\|\| */,$rest);
+                     if(scalar (@ITEM)>2) { # do not filter reordering table
+                       my @SCORE = split(/ /,$ITEM[1]);
+                       my $okay = 1;
+                       foreach my $id (keys %MIN_SCORE) {
+                         $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id};
+                       }
+                       next unless $okay;
+                     }
+                  }
+                  print FILE_OUT $entry;
+                  $used++;
+              }
+              $total++;
+          }
+          close(FILE);
+          die "No phrases found in $file!" if $total == 0;
+          printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+      }
+
+      close(FILE_OUT);
+
+    }
+
+    my $catcmd = ($mid_file =~ /\.gz$/ ? "$ZCAT" : "cat");
+    if(defined($binarizer)) {
+      print STDERR "binarizing...\n";
+      # translation model
+      if ($KNOWN_TTABLE{$i}) {
+        # ... hierarchical translation model
+        if ($opt_hierarchical) {
+          my $cmd = "$binarizer $mid_file $new_file.bin";
+          safesystem($cmd) or die "Can't binarize";
+        }
+        # ... phrase translation model
+        elsif ($binarizer =~ /processPhraseTableMin/) {
+          #compact phrase table
+          ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
+          my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
+          safesystem($cmd) or die "Can't binarize";
+        } elsif ($binarizer =~ /CreateOnDiskPt/) {
+      	  my $cmd = "$binarizer $mid_file $new_file.bin";
+          safesystem($cmd) or die "Can't binarize";
+        } else { 
+          my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
+          safesystem($cmd) or die "Can't binarize";
+        }
+      }
+      # reordering model
+      else {
+        my $lexbin;
+        $lexbin = $binarizer;
+        if ($binarizer =~ /CreateOnDiskPt/) {
+          $lexbin =~ s/CreateOnDiskPt/processLexicalTable/;
+        }
+
+        $lexbin =~ s/PhraseTable/LexicalTable/;
+        my $cmd;
+        if ($lexbin =~ /processLexicalTableMin/) {
+          $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
+        } else {
+          $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
+          $cmd = "$lexbin -in $mid_file -out $new_file";
+        }
+        safesystem($cmd) or die "Can't binarize";
+      }
+    }
+}
+
+# Remove any temporary input files
+unlink values %TMP_INPUT_FILENAME;
+
+open(INFO,">$dir/info");
+print INFO "$config\n$input\n";
+close(INFO);
+
+
+print "To run the decoder, please call:
+  moses -f $dir/moses.ini -i $input\n";
+
+# functions
+sub mk_open_string {
+  my $file = shift;
+  my $openstring;
+  if ($file !~ /\.gz$/ && -e "$file.gz") {
+    $openstring = "$ZCAT $file.gz |";
+  } elsif ($file =~ /\.gz$/) {
+    $openstring = "$ZCAT $file |";
+  } elsif ($opt_hierarchical) {
+    $openstring = "cat $file |";
+  } else {
+    $openstring = "< $file";
+  }
+  return $openstring;
+}
+
+
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system("bash", "-c", @_);
+  if ($? == -1) {
+      print STDERR "Failed to execute: @_\n  $!\n";
+      exit(1);
+  }
+  elsif ($? & 127) {
+      printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
+          ($? & 127),  ($? & 128) ? 'with' : 'without';
+      exit(1);
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+
+sub ensure_full_path {
+    my $PATH = shift;
+    return $PATH if $PATH =~ /^\//;
+    my $dir = `pawd 2>/dev/null`;
+    if (!$dir) {$dir = `pwd`;}
+    chomp $dir;
+    $PATH = $dir."/".$PATH;
+    $PATH =~ s/[\r\n]//g;
+    $PATH =~ s/\/\.\//\//g;
+    $PATH =~ s/\/+/\//g;
+    my $sanity = 0;
+    while($PATH =~ /\/\.\.\// && $sanity++<10) {
+        $PATH =~ s/\/+/\//g;
+        $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
+    }
+    $PATH =~ s/\/[^\/]+\/\.\.$//;
+    $PATH =~ s/\/+$//;
+    return $PATH;
+}
+
+sub join_array {
+  my @outside = @{$_[0]};
+   
+  my $ret = "";
+  for (my $i = 0; $i < scalar(@outside); ++$i) {
+    my $tok = $outside[$i];    
+    $ret .= "$tok ";
+  }
+  
+  return $ret;
+}
+
+sub set_value {
+  my @arr = @{$_[0]};
+  my $keySought = $_[1];
+  my $newValue = $_[2];
+
+  for (my $i = 1; $i < scalar(@arr); ++$i) {
+    my @inside = split(/=/, $arr[$i]);
+
+		my $key = $inside[0];
+		if ($key eq $keySought) {
+		  $arr[$i] = "$key=$newValue";
+		  return @arr;
+		}
+  }
+  return @arr;
+}
+
+
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index d015d3762..9373e44c1 100644
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -36,7 +36,7 @@ my $ZCAT = "gzip -cd";
 # get optional parameters
 my $opt_hierarchical = 0;
 my $binarizer = undef;
-my $threads = undef; # Default is single-thread
+my $threads = undef; # Default is single-thread, i.e. $threads=1
 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical";
 my $min_score = undef;
 my $opt_min_non_initial_rule_count = undef;
@@ -64,7 +64,7 @@ my $config = shift;
 my $input = shift;
 
 if (!defined $dir || !defined $config || !defined $input) {
-  print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n";
+  print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n";
   exit 1;
 }
 $dir = ensure_full_path($dir);
-- 
cgit v1.2.3


From d453ccc9f5ac88fb1b346f622dd76512077181c7 Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Fri, 24 Apr 2015 19:14:04 +0200
Subject: removed wrongly added perl script...

---
 scripts/training/filter-model-given-input-new.pl | 537 -----------------------
 1 file changed, 537 deletions(-)
 delete mode 100644 scripts/training/filter-model-given-input-new.pl

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input-new.pl b/scripts/training/filter-model-given-input-new.pl
deleted file mode 100644
index 029d83ed0..000000000
--- a/scripts/training/filter-model-given-input-new.pl
+++ /dev/null
@@ -1,537 +0,0 @@
-#!/usr/bin/perl -w
-
-# $Id$
-# Given a moses.ini file and an input text prepare minimized translation
-# tables and a new moses.ini, so that loading of tables is much faster.
-
-# original code by Philipp Koehn
-# changes by Ondrej Bojar
-# adapted for hierarchical models by Phil Williams
-
-use strict;
-
-use FindBin qw($RealBin);
-use Getopt::Long;
-
-my $SCRIPTS_ROOTDIR;
-if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
-    $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
-} else {
-    $SCRIPTS_ROOTDIR = $RealBin;
-    if ($SCRIPTS_ROOTDIR eq '') {
-        $SCRIPTS_ROOTDIR = dirname(__FILE__);
-    }
-    $SCRIPTS_ROOTDIR =~ s/\/training$//;
-    $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
-}
-
-# consider phrases in input up to $MAX_LENGTH
-# in other words, all phrase-tables will be truncated at least to 10 words per
-# phrase.
-my $MAX_LENGTH = 10;
-
-# utilities
-my $ZCAT = "gzip -cd";
-
-# get optional parameters
-my $opt_hierarchical = 0;
-my $binarizer = undef;
-my $threads = undef; # Default is single-thread
-my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical";
-my $min_score = undef;
-my $opt_min_non_initial_rule_count = undef;
-my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
-my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice
-my $opt_strip_xml = 1; # disabling XML stripping is required for STSG models where the input is a tree or forest
-my $tempdir = undef;
-
-GetOptions(
-    "gzip!" => \$opt_gzip,
-    "filter!" => \$opt_filter,
-    "Hierarchical" => \$opt_hierarchical,
-    "Binarizer=s" => \$binarizer,
-    "StripXml!" => \$opt_strip_xml,
-    "SyntaxFilterCmd=s" => \$syntax_filter_cmd,
-    "tempdir=s" => \$tempdir,
-    "MinScore=s" => \$min_score,
-    "threads" => \$threads,
-    "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count,  # DEPRECATED
-) or exit(1);
-
-# get command line parameters
-my $dir = shift;
-my $config = shift;
-my $input = shift;
-
-if (!defined $dir || !defined $config || !defined $input) {
-  print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n";
-  exit 1;
-}
-$dir = ensure_full_path($dir);
-
-# Warn if deprecated -MinNonInitialRuleCount option is used
-if (defined($opt_min_non_initial_rule_count)) {
-  print STDERR "WARNING: -MinNonInitialRuleCount is deprecated; use score's -MinCountHierarchical option or set -SyntaxFilterCmd to \"$SCRIPTS_ROOTDIR/training/filter-rule-table.py --min-non-initial-rule=$opt_min_non_initial_rule_count\"\n";
-}
-
-$tempdir = $dir if !defined $tempdir; # use the working directory as temp by def.
-
-# decode min-score definitions
-my %MIN_SCORE;
-if ($min_score) {
-  foreach (split(/ *, */,$min_score)) {
-    my ($id,$score) = split(/ *: */);
-    $MIN_SCORE{$id} = $score;
-    print STDERR "score $id must be at least $score\n";
-  }
-}
-# buggy directory in place?
-if (-d $dir && ! -e "$dir/info") {
-    print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n";
-    exit(1);
-}
-
-# already filtered? check if it can be re-used
-if (-d $dir) {
-    my @INFO = `cat $dir/info`;
-    chop(@INFO);
-    if($INFO[0] ne $config 
-       || ($INFO[1] ne $input && 
-	   $INFO[1].".tagged" ne $input)) {
-      print STDERR "WARNING: directory exists but does not match parameters:\n";
-      print STDERR "  ($INFO[0] ne $config || $INFO[1] ne $input)\n";
-      exit 1;
-    }
-    print STDERR "The filtered model was ready in $dir, not doing anything.\n";
-    exit 0;
-}
-
-# filter the translation and distortion tables
-safesystem("mkdir -p $dir") or die "Can't mkdir $dir";
-
-my $cmd;
-if ($opt_strip_xml) {
-    my $inputStrippedXML = "$dir/input.$$";
-    $cmd = "$RealBin/../generic/strip-xml.perl < $input > $inputStrippedXML";
-    print STDERR "Stripping XML...\n";
-    safesystem($cmd) or die "Can't strip XML";
-    $input = $inputStrippedXML;
-}
-
-# get tables to be filtered (and modify config file)
-my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER);
-
-my %new_name_used = ();
-open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini";
-open(INI,$config) or die "Can't read $config";
-while(my $line = <INI>) {
-  chomp($line);
-  my @toks = split(/ /, $line);
-  if ($line =~ /PhraseDictionaryMemory /
-     || $line =~ /PhraseDictionaryBinary /
-     || $line =~ /PhraseDictionaryOnDisk /
-     || $line =~ /PhraseDictionarySCFG /
-     || $line =~ /RuleTable /
-     ) {
-    print STDERR "pt:$line\n";
-
-		my ($phrase_table_impl,$source_factor,$t,$w,$file,$table_flag,$skip); # = ($1,$2,$3,$4,$5,$6,$7);
-    $table_flag = "";
-    $phrase_table_impl = $toks[0];
-    $skip = 0;
-    
-    for (my $i = 1; $i < scalar(@toks); ++$i) {
-      my @args = split(/=/, $toks[$i]);
-      chomp($args[0]);
-      chomp($args[1]);
-
-      if ($args[0] eq "num-features") {
-			  $w = $args[1];
-			}
-			elsif ($args[0] eq "input-factor") {
-			  $source_factor = $args[1];
-			}
-			elsif ($args[0] eq "output-factor") {
-			  $t = $args[1];
-			}
-			elsif ($args[0] eq "path") {
-			  $file = $args[1];
-			}
-			elsif ($args[0] eq "filterable" && $args[1] eq "false") {
-			  $skip = 1;
-			}
-    } #for (my $i = 1; $i < scalar(@toks); ++$i) {
-    
-		if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG" && $phrase_table_impl ne "RuleTable") || $file =~ /glue-grammar/ || $skip) {
-				# Only Memory ("0") and NewFormat ("6") can be filtered.
-				print INI_OUT "$line\n";
-				next;
-		}
-
-		push @TABLE, $file;
-		push @TABLE_WEIGHTS,$w;
-		$KNOWN_TTABLE{$#TABLE}++;
-
-  	my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"});
-		my $cnt = 1;
-		$cnt ++ while (defined $new_name_used{"$new_name.$cnt"});
-		$new_name .= ".$cnt";
-		$new_name_used{$new_name} = 1;
-		if ($binarizer && $phrase_table_impl eq "PhraseDictionarySCFG") {
-		  $phrase_table_impl = "PhraseDictionaryOnDisk";
-		  @toks = set_value(\@toks, "path", "$new_name.bin$table_flag");
-		}
-		elsif ($binarizer && $phrase_table_impl eq "PhraseDictionaryMemory") {
-			if ($binarizer =~ /processPhraseTableMin/) {
-  		  $phrase_table_impl = "PhraseDictionaryCompact";
-  		  @toks = set_value(\@toks, "path", "$new_name$table_flag");
-			}
-			elsif ($binarizer =~ /CreateOnDiskPt/) {
-  		  $phrase_table_impl = "PhraseDictionaryOnDisk";
-		  @toks = set_value(\@toks, "path", "$new_name.bin$table_flag");
-			}
-			else {
-  		  $phrase_table_impl = "PhraseDictionaryBinary";
-			  @toks = set_value(\@toks, "path", "$new_name$table_flag");
-			}
-		}
-		else {
-			$new_name .= ".gz" if $opt_gzip;
-		  @toks = set_value(\@toks, "path", "$new_name$table_flag");
-		}
-
-    $toks[0] = $phrase_table_impl;
-
-    print INI_OUT join_array(\@toks)."\n";
-
-		push @TABLE_NEW_NAME,$new_name;
-
-		$CONSIDER_FACTORS{$source_factor} = 1;
-			print STDERR "Considering factor $source_factor\n";
-		push @TABLE_FACTORS, $source_factor;
-		
-  } #if (/PhraseModel /) {
-  elsif ($line =~ /LexicalReordering /) {
-    print STDERR "ro:$line\n";
-		my ($source_factor, $t, $w, $file); # = ($1,$2,$3,$4);
-
-    for (my $i = 1; $i < scalar(@toks); ++$i) {
-      my @args = split(/=/, $toks[$i]);
-      chomp($args[0]);
-      chomp($args[1]);
-      
-			if ($args[0] eq "num-features") {
-			  $w = $args[1];
-			}
-			elsif ($args[0] eq "input-factor") {
-			  $source_factor = chomp($args[1]);
-			}
-			elsif ($args[0] eq "output-factor") {
-			  #$t = chomp($args[1]);
-			}
-			elsif ($args[0] eq "type") {
-			  $t = $args[1];
-			}
-			elsif ($args[0] eq "path") {
-			  $file = $args[1];
-			}
-
-		} # for (my $i = 1; $i < scalar(@toks); ++$i) {
-		
-  	push @TABLE, $file;
-	push @TABLE_WEIGHTS,$w;
-		
-		$file =~ s/^.*\/+([^\/]+)/$1/g;
-		my $new_name = "$dir/$file";
-		$new_name =~ s/\.gz//;
-		
-		#print INI_OUT "$source_factor $t $w $new_name\n";
-	  @toks = set_value(\@toks, "path", "$new_name");
-	  print INI_OUT join_array(\@toks)."\n";
-
-		push @TABLE_NEW_NAME,$new_name;
-
-		$CONSIDER_FACTORS{$source_factor} = 1;
-			print STDERR "Considering factor $source_factor\n";
-		push @TABLE_FACTORS,$source_factor;
-
-		
-  } #elsif (/LexicalReordering /) {
-  else {
-    print INI_OUT "$line\n";  
-  }
-} # while(<INI>) {
-close(INI);
-close(INI_OUT);
-
-my %TMP_INPUT_FILENAME;
-
-if ($opt_hierarchical) {
-  if (!$opt_strip_xml) {
-    print STDERR "WARNING: source factor reduction is disabled due to use of -noStripXML option\n";
-  } else {
-    # Write a separate, temporary input file for each combination of source
-    # factors
-    foreach my $key (keys %CONSIDER_FACTORS) {
-      my $filename = "$dir/input-$key";
-      open(FILEHANDLE,">$filename") or die "Can't open $filename for writing";
-      $TMP_INPUT_FILENAME{$key} = $filename;
-      my @FACTOR = split(/,/, $key);
-      my $cmd = "$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |";
-      print STDERR "Executing: $cmd\n";
-      open(PIPE,$cmd);
-      while (my $line = <PIPE>) {
-        print FILEHANDLE $line
-      }
-      close(FILEHANDLE);
-    }
-  }
-}
-
-my %PHRASE_USED;
-if ($opt_filter && !$opt_hierarchical) {
-    # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
-    open(INPUT,mk_open_string($input)) or die "Can't read $input";
-    while(my $line = <INPUT>) {
-        chomp($line);
-        my @WORD = split(/ +/,$line);
-        for(my $i=0;$i<=$#WORD;$i++) {
-            for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
-                foreach (keys %CONSIDER_FACTORS) {
-                    my @FACTOR = split(/,/);
-                    my $phrase = "";
-                    for(my $k=$i;$k<=$i+$j;$k++) {
-                        my @WORD_FACTOR = split(/\|/,$WORD[$k]);
-                        for(my $f=0;$f<=$#FACTOR;$f++) {
-                            $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
-                        }
-                        chop($phrase);
-                        $phrase .= " ";
-                    }
-                    chop($phrase);
-                    $PHRASE_USED{$_}{$phrase}++;
-                }
-            }
-        }
-    }
-    close(INPUT);
-}
-
-# filter files
-print STDERR "Filtering files...\n";
-for(my $i=0;$i<=$#TABLE;$i++) {
-    my ($used,$total) = (0,0);
-    my $file = $TABLE[$i];
-    my $factors = $TABLE_FACTORS[$i];
-    my $new_file = $TABLE_NEW_NAME[$i];
-    print STDERR "filtering $file -> $new_file...\n";
-    my $mid_file = $new_file; # used when both filtering and binarizing
-    if (!$opt_filter) {
-      # check if original file was gzipped
-      if ($file !~ /\.gz$/ && -e "$file.gz") {
-        $file .= ".gz";
-      }
-      $mid_file .= ".gz" if $file =~ /\.gz$/;
-      $cmd = "ln -s $file $mid_file";
-      safesystem($cmd) or die "Failed to make symlink";
-    } else {
-
-      $mid_file .= ".gz"
-        if $mid_file !~ /\.gz/
-           && $binarizer && $binarizer =~ /processPhraseTable/;
-
-      my $openstring = mk_open_string($file);
-
-      my $mid_openstring;
-      if ($mid_file =~ /\.gz$/) {
-        $mid_openstring = "| gzip -c > $mid_file";
-      } else {
-        $mid_openstring = ">$mid_file";
-      }
-
-
-      open(FILE_OUT,$mid_openstring) or die "Can't write to $mid_openstring";
-
-      if ($opt_hierarchical) {
-          my $input_file = $opt_strip_xml ? $TMP_INPUT_FILENAME{$factors} : $input;
-          $cmd = "$openstring $syntax_filter_cmd $input_file |";
-          print STDERR "Executing: $cmd\n";
-          open(PIPE,$cmd);
-          while (my $line = <PIPE>) {
-              print FILE_OUT $line
-          }
-          close(FILEHANDLE);
-      } else {
-          open(FILE,$openstring) or die "Can't open '$openstring'";
-          while(my $entry = <FILE>) {
-              my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
-              $foreign =~ s/ $//;
-              if (defined($PHRASE_USED{$factors}{$foreign})) {
-                  # handle min_score thresholds
-                  if ($min_score) {
-                     my @ITEM = split(/ *\|\|\| */,$rest);
-                     if(scalar (@ITEM)>2) { # do not filter reordering table
-                       my @SCORE = split(/ /,$ITEM[1]);
-                       my $okay = 1;
-                       foreach my $id (keys %MIN_SCORE) {
-                         $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id};
-                       }
-                       next unless $okay;
-                     }
-                  }
-                  print FILE_OUT $entry;
-                  $used++;
-              }
-              $total++;
-          }
-          close(FILE);
-          die "No phrases found in $file!" if $total == 0;
-          printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
-      }
-
-      close(FILE_OUT);
-
-    }
-
-    my $catcmd = ($mid_file =~ /\.gz$/ ? "$ZCAT" : "cat");
-    if(defined($binarizer)) {
-      print STDERR "binarizing...\n";
-      # translation model
-      if ($KNOWN_TTABLE{$i}) {
-        # ... hierarchical translation model
-        if ($opt_hierarchical) {
-          my $cmd = "$binarizer $mid_file $new_file.bin";
-          safesystem($cmd) or die "Can't binarize";
-        }
-        # ... phrase translation model
-        elsif ($binarizer =~ /processPhraseTableMin/) {
-          #compact phrase table
-          ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
-          my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
-          safesystem($cmd) or die "Can't binarize";
-        } elsif ($binarizer =~ /CreateOnDiskPt/) {
-      	  my $cmd = "$binarizer $mid_file $new_file.bin";
-          safesystem($cmd) or die "Can't binarize";
-        } else { 
-          my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
-          safesystem($cmd) or die "Can't binarize";
-        }
-      }
-      # reordering model
-      else {
-        my $lexbin;
-        $lexbin = $binarizer;
-        if ($binarizer =~ /CreateOnDiskPt/) {
-          $lexbin =~ s/CreateOnDiskPt/processLexicalTable/;
-        }
-
-        $lexbin =~ s/PhraseTable/LexicalTable/;
-        my $cmd;
-        if ($lexbin =~ /processLexicalTableMin/) {
-          $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
-        } else {
-          $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
-          $cmd = "$lexbin -in $mid_file -out $new_file";
-        }
-        safesystem($cmd) or die "Can't binarize";
-      }
-    }
-}
-
-# Remove any temporary input files
-unlink values %TMP_INPUT_FILENAME;
-
-open(INFO,">$dir/info");
-print INFO "$config\n$input\n";
-close(INFO);
-
-
-print "To run the decoder, please call:
-  moses -f $dir/moses.ini -i $input\n";
-
-# functions
-sub mk_open_string {
-  my $file = shift;
-  my $openstring;
-  if ($file !~ /\.gz$/ && -e "$file.gz") {
-    $openstring = "$ZCAT $file.gz |";
-  } elsif ($file =~ /\.gz$/) {
-    $openstring = "$ZCAT $file |";
-  } elsif ($opt_hierarchical) {
-    $openstring = "cat $file |";
-  } else {
-    $openstring = "< $file";
-  }
-  return $openstring;
-}
-
-
-sub safesystem {
-  print STDERR "Executing: @_\n";
-  system("bash", "-c", @_);
-  if ($? == -1) {
-      print STDERR "Failed to execute: @_\n  $!\n";
-      exit(1);
-  }
-  elsif ($? & 127) {
-      printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
-          ($? & 127),  ($? & 128) ? 'with' : 'without';
-      exit(1);
-  }
-  else {
-    my $exitcode = $? >> 8;
-    print STDERR "Exit code: $exitcode\n" if $exitcode;
-    return ! $exitcode;
-  }
-}
-
-sub ensure_full_path {
-    my $PATH = shift;
-    return $PATH if $PATH =~ /^\//;
-    my $dir = `pawd 2>/dev/null`;
-    if (!$dir) {$dir = `pwd`;}
-    chomp $dir;
-    $PATH = $dir."/".$PATH;
-    $PATH =~ s/[\r\n]//g;
-    $PATH =~ s/\/\.\//\//g;
-    $PATH =~ s/\/+/\//g;
-    my $sanity = 0;
-    while($PATH =~ /\/\.\.\// && $sanity++<10) {
-        $PATH =~ s/\/+/\//g;
-        $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
-    }
-    $PATH =~ s/\/[^\/]+\/\.\.$//;
-    $PATH =~ s/\/+$//;
-    return $PATH;
-}
-
-sub join_array {
-  my @outside = @{$_[0]};
-   
-  my $ret = "";
-  for (my $i = 0; $i < scalar(@outside); ++$i) {
-    my $tok = $outside[$i];    
-    $ret .= "$tok ";
-  }
-  
-  return $ret;
-}
-
-sub set_value {
-  my @arr = @{$_[0]};
-  my $keySought = $_[1];
-  my $newValue = $_[2];
-
-  for (my $i = 1; $i < scalar(@arr); ++$i) {
-    my @inside = split(/=/, $arr[$i]);
-
-		my $key = $inside[0];
-		if ($key eq $keySought) {
-		  $arr[$i] = "$key=$newValue";
-		  return @arr;
-		}
-  }
-  return @arr;
-}
-
-
-- 
cgit v1.2.3


From e1fcc8082a29762946b92e1b068551481fc2b743 Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Fri, 24 Apr 2015 19:30:40 +0200
Subject: use integer type when reading options instead of checking for undef.
 it's more elegant.

---
 scripts/training/filter-model-given-input.pl | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index 9373e44c1..84dbbe879 100644
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -36,7 +36,7 @@ my $ZCAT = "gzip -cd";
 # get optional parameters
 my $opt_hierarchical = 0;
 my $binarizer = undef;
-my $threads = undef; # Default is single-thread, i.e. $threads=1
+my $threads = 1; # Default is single-thread, i.e. $threads=1
 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical";
 my $min_score = undef;
 my $opt_min_non_initial_rule_count = undef;
@@ -54,7 +54,7 @@ GetOptions(
     "SyntaxFilterCmd=s" => \$syntax_filter_cmd,
     "tempdir=s" => \$tempdir,
     "MinScore=s" => \$min_score,
-    "threads" => \$threads,
+    "threads=i" => \$threads,
     "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count,  # DEPRECATED
 ) or exit(1);
 
@@ -407,9 +407,6 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         elsif ($binarizer =~ /processPhraseTableMin/) {
           #compact phrase table
           ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
-          if(!defined($threads)) {
-          	$threads = 1
-          }
           my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
           safesystem($cmd) or die "Can't binarize";
         } elsif ($binarizer =~ /CreateOnDiskPt/) {
@@ -431,9 +428,6 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         $lexbin =~ s/PhraseTable/LexicalTable/;
         my $cmd;
         if ($lexbin =~ /processLexicalTableMin/) {
-          if(!defined($threads)) {
-          	$threads = 1
-          }
           $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
         } else {
           $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
-- 
cgit v1.2.3


From c01b0a6262fbf92e1908ebc31c5f0894c489a2cf Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Sun, 26 Apr 2015 20:25:15 +0200
Subject: merging the filter-model-given-input.pl with alvations-master branch

---
 scripts/training/filter-model-given-input.pl | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index 7dec0762c..84dbbe879 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl 
+#!/usr/bin/perl -w
 
 # $Id$
 # Given a moses.ini file and an input text prepare minimized translation
@@ -8,7 +8,6 @@
 # changes by Ondrej Bojar
 # adapted for hierarchical models by Phil Williams
 
-use warnings;
 use strict;
 
 use FindBin qw($RealBin);
@@ -37,6 +36,7 @@ my $ZCAT = "gzip -cd";
 # get optional parameters
 my $opt_hierarchical = 0;
 my $binarizer = undef;
+my $threads = 1; # Default is single-thread, i.e. $threads=1
 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical";
 my $min_score = undef;
 my $opt_min_non_initial_rule_count = undef;
@@ -54,6 +54,7 @@ GetOptions(
     "SyntaxFilterCmd=s" => \$syntax_filter_cmd,
     "tempdir=s" => \$tempdir,
     "MinScore=s" => \$min_score,
+    "threads=i" => \$threads,
     "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count,  # DEPRECATED
 ) or exit(1);
 
@@ -63,7 +64,7 @@ my $config = shift;
 my $input = shift;
 
 if (!defined $dir || !defined $config || !defined $input) {
-  print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n";
+  print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n";
   exit 1;
 }
 $dir = ensure_full_path($dir);
@@ -405,7 +406,8 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         # ... phrase translation model
         elsif ($binarizer =~ /processPhraseTableMin/) {
           #compact phrase table
-          my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] && rm $mid_file.sorted";
+          ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
+          my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
           safesystem($cmd) or die "Can't binarize";
         } elsif ($binarizer =~ /CreateOnDiskPt/) {
       	  my $cmd = "$binarizer $mid_file $new_file.bin";
@@ -426,7 +428,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         $lexbin =~ s/PhraseTable/LexicalTable/;
         my $cmd;
         if ($lexbin =~ /processLexicalTableMin/) {
-          $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file && rm $mid_file.sorted";
+          $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
         } else {
           $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
           $cmd = "$lexbin -in $mid_file -out $new_file";
-- 
cgit v1.2.3


From ec54ea3c4fcdb055661dba1fe3003d6bb1a0bed8 Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Sun, 26 Apr 2015 20:30:15 +0200
Subject: put back some of the difference made after RELEASE3.0 and
 incorporated it with the -threads parameter

---
 scripts/training/filter-model-given-input.pl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index 84dbbe879..1464fdb73 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 
 
 # $Id$
 # Given a moses.ini file and an input text prepare minimized translation
@@ -8,6 +8,7 @@
 # changes by Ondrej Bojar
 # adapted for hierarchical models by Phil Williams
 
+use warnings;
 use strict;
 
 use FindBin qw($RealBin);
@@ -406,8 +407,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         # ... phrase translation model
         elsif ($binarizer =~ /processPhraseTableMin/) {
           #compact phrase table
-          ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
-          my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
+          my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
           safesystem($cmd) or die "Can't binarize";
         } elsif ($binarizer =~ /CreateOnDiskPt/) {
       	  my $cmd = "$binarizer $mid_file $new_file.bin";
-- 
cgit v1.2.3


From 4a68c42b16626e2ee707e93a6453eda51dc807a1 Mon Sep 17 00:00:00 2001
From: alvations <alvations@gmail.com>
Date: Sun, 26 Apr 2015 20:37:10 +0200
Subject: syncing to latest moses version

---
 scripts/training/filter-model-given-input.pl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index 84dbbe879..1464fdb73 100644
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 
 
 # $Id$
 # Given a moses.ini file and an input text prepare minimized translation
@@ -8,6 +8,7 @@
 # changes by Ondrej Bojar
 # adapted for hierarchical models by Phil Williams
 
+use warnings;
 use strict;
 
 use FindBin qw($RealBin);
@@ -406,8 +407,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         # ... phrase translation model
         elsif ($binarizer =~ /processPhraseTableMin/) {
           #compact phrase table
-          ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
-          my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None";
+          my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
           safesystem($cmd) or die "Can't binarize";
         } elsif ($binarizer =~ /CreateOnDiskPt/) {
       	  my $cmd = "$binarizer $mid_file $new_file.bin";
-- 
cgit v1.2.3


From da648fd65b7de9709b5cd3b094138f68a9584de0 Mon Sep 17 00:00:00 2001
From: Rico Sennrich <rico.sennrich@gmx.ch>
Date: Mon, 27 Apr 2015 10:52:16 +0100
Subject: fix some RDLM training options

---
 scripts/training/rdlm/README                      |  4 ++--
 scripts/training/rdlm/extract_syntactic_ngrams.py |  5 +++--
 scripts/training/rdlm/extract_vocab.py            |  4 ----
 scripts/training/rdlm/train_rdlm.py               | 17 +++++++++--------
 4 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'scripts')

diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README
index 209daf1c0..ca2a06015 100644
--- a/scripts/training/rdlm/README
+++ b/scripts/training/rdlm/README
@@ -31,8 +31,8 @@ RDLM is split into two neural network models, which can be trained with
 
   mkdir working_dir_head
   mkdir working_dir_label
-  ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head  --output-dir /path/to/output_directory --output-model rdlm_head  --mode head  --output-vocab-size 500000 --noise-samples 100
-  ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise-samples 50
+  ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_head  --output-dir /path/to/output_directory --output-model rdlm_head  --mode head  --output-vocab-size 500000 --noise 100
+  ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise 50
 
 for more options, run `train_rdlm.py --help`. Parameters you may want to adjust
 include the vocabulary size of the label model (depending on the number of
diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
index eca1b3a49..f3ce41080 100755
--- a/scripts/training/rdlm/extract_syntactic_ngrams.py
+++ b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -113,13 +113,14 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
             int_list.extend(parent_heads)
             int_list.extend(parent_labels)
 
+            # write root of tree
             if options.mode == 'label':
                 int_list.append(output_vocab.get(label, 0))
-                sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+                options.output.write(' '.join(map(str, int_list)) + '\n')
             elif options.mode == 'head' and not head == '<dummy_head>':
                 int_list.append(vocab.get(label, 0))
                 int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0)))
-                sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+                options.output.write(' '.join(map(str, int_list)) + '\n')
 
             parent_heads.append(vocab.get(head, 0))
             parent_labels.append(vocab.get(label, 0))
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
index 55ecbe554..6d017602e 100755
--- a/scripts/training/rdlm/extract_vocab.py
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -59,10 +59,6 @@ def get_head(xml, args):
             preterminal = child.get('label')
             head = escape_text(child.text.strip())
 
-        # hack for split compounds
-        elif child[-1].get('label') == 'SEGMENT':
-            return escape_text(child[-1].text.strip()), 'SEGMENT'
-
         elif args.ptkvz and head and child.get('label') == 'avz':
             for grandchild in child:
                 if grandchild.get('label') == 'PTKVZ':
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
index 1e7ecac52..15e56c430 100755
--- a/scripts/training/rdlm/train_rdlm.py
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -43,7 +43,7 @@ parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar
 parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
 parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
 parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
-parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
+parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
 
 
 parser.set_defaults(
@@ -95,7 +95,7 @@ def prepare_vocabulary(options):
       filtered_vocab = open(orig).readlines()
       orig = vocab_prefix + '.nonterminals'
       filtered_vocab += open(orig).readlines()
-      filtered_vocab = [word for word in filtered_vocab if not word.startswith(prefix) for prefix in blacklist]
+      filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
       if options.output_vocab_size:
         filtered_vocab = filtered_vocab[:options.output_vocab_size]
     else:
@@ -127,12 +127,13 @@ def main(options):
   sys.stderr.write('extracting syntactic n-grams\n')
   extract_syntactic_ngrams.main(extract_options)
 
-  if validation_corpus:
-    extract_options.input = options.validation_corpus
-    options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized')
-    extract_options.output = options.validation_file
+  if options.validation_corpus:
+    extract_options.input = open(options.validation_corpus)
+    options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus))
+    extract_options.output = open(options.validation_file + '.numberized', 'w')
     sys.stderr.write('extracting syntactic n-grams (validation file)\n')
     extract_syntactic_ngrams.main(extract_options)
+    extract_options.output.close()
 
   sys.stderr.write('training neural network\n')
   train_nplm.main(options)
@@ -141,8 +142,8 @@ def main(options):
   ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'),
                    options.nplm_home,
                    os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
-                   os.path.join(options.working_dir, options.corpus_stem + '.numberized'),
-                   os.path.join(options.output_dir, options.output_model + '.model.nplm.')
+                   os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+                   os.path.join(options.output_dir, options.output_model + '.model.nplm')
                    ])
   if ret:
       raise Exception("averaging null words failed")
-- 
cgit v1.2.3


From a47fc006359b68eea2fcc369fae983338226a925 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Mon, 27 Apr 2015 17:35:19 +0400
Subject: option to output factors

---
 scripts/training/wrappers/madamira-wrapper.perl | 57 +++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 4 deletions(-)

(limited to 'scripts')

diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl
index 6e7efe245..9866e6421 100755
--- a/scripts/training/wrappers/madamira-wrapper.perl
+++ b/scripts/training/wrappers/madamira-wrapper.perl
@@ -8,21 +8,32 @@ use File::Basename;
 use FindBin qw($RealBin);
 use Cwd 'abs_path';
 
+sub GetFactors;
+
+
 my $TMPDIR = "tmp";
 my $SCHEME = "D2";
 my $KEEP_TMP = 0;
 my $MADA_DIR;
 
+my $FACTORS_STR;
+my @FACTORS;
+
 GetOptions(
   "scheme=s" => \$SCHEME,
   "tmpdir=s" => \$TMPDIR,
   "keep-tmp" => \$KEEP_TMP,
-  "mada-dir=s" => \$MADA_DIR
+  "mada-dir=s" => \$MADA_DIR,
+  "factors=s" => \$FACTORS_STR
     ) or die("ERROR: unknown options");
 
 $TMPDIR = abs_path($TMPDIR);
 print STDERR "TMPDIR=$TMPDIR \n";
 
+if (defined($FACTORS_STR)) {
+    @FACTORS = split(",", $FACTORS_STR);
+}
+
 #binmode(STDIN, ":utf8");
 #binmode(STDOUT, ":utf8");
 
@@ -75,13 +86,21 @@ while(my $line = <MADA_OUT>) {
 	print "\n";
     }
     elsif (index($line, ";;WORD") == 0) {
-    # word
+        # word
 	my $word = substr($line, 7, length($line) - 8);
-    #print STDERR "FOund $word\n";
+        #print STDERR "FOund $word\n";
+	
+	for (my $i = 0; $i < 4; ++$i) {
+	    $line = <MADA_OUT>;
+	}
+	
+	my $factors = GetFactors($line, \@FACTORS);
+	$word .= $factors;
+
 	print "$word ";
     }
     else {
-    #print STDERR "NADA\n";
+      #print STDERR "NADA\n";
     }
 }
 close (MADA_OUT);
@@ -91,3 +110,33 @@ if ($KEEP_TMP == 0) {
 #    `rm -rf $TMPDIR`;
 }
 
+
+###########################
+sub GetFactors
+{
+    # Build the "|value" factor suffix for one word. $line holds
+    # space-separated "key:value" tokens (the first token is skipped);
+    # $factorsRef lists the keys to emit, in order. A key with no value
+    # yields an empty factor, so the number of "|" fields stays fixed.
+    my ($line, $factorsRef) = @_;
+    $line = "" unless defined($line);    # undef if the caller ran out of input
+
+    my %allFactors;
+    my @toks = split(" ", $line);
+    shift(@toks);
+    foreach my $tok (@toks) {
+        # Limit 2: split on the first ":" only, so a value that contains
+        # ":" survives intact (e.g. "diac::" maps diac to ":").
+        my ($key, $value) = split(":", $tok, 2);
+        $allFactors{$key} = $value;
+    }
+
+    my $ret = "";
+    foreach my $factorType (@{$factorsRef}) {
+        my $value = $allFactors{$factorType};
+        $ret .= "|" . (defined($value) ? $value : "");
+    }
+
+    return $ret;
+}
+
-- 
cgit v1.2.3


From 8adad4fc2e1ae609ffbd8fe76261540cac19a125 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Mon, 27 Apr 2015 17:39:49 +0400
Subject: exec permission

---
 scripts/training/filter-model-given-input.pl | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/training/filter-model-given-input.pl

(limited to 'scripts')

diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
old mode 100644
new mode 100755
-- 
cgit v1.2.3


From b7792b227a337c36d97d3c0979d11e6955ba368c Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Tue, 28 Apr 2015 12:29:58 +0400
Subject: script to convert arabic to bw, and vice versa

---
 scripts/other/buckwalter.perl | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100755 scripts/other/buckwalter.perl

(limited to 'scripts')

diff --git a/scripts/other/buckwalter.perl b/scripts/other/buckwalter.perl
new file mode 100755
index 000000000..62544e212
--- /dev/null
+++ b/scripts/other/buckwalter.perl
@@ -0,0 +1,33 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Encode::Arabic::Buckwalter;
+use Getopt::Long "GetOptions";
+
+# Transliterate STDIN line by line between UTF-8 Arabic and Buckwalter.
+# --direction: 1=arabic->bw, 2=bw->arabic
+my $direction;
+GetOptions('direction=i' => \$direction)
+    or exit(1);
+
+die("ERROR: need to set direction") unless defined($direction);
+
+# Validate once up front so a bad value fails even when STDIN is empty.
+my ($from, $to);
+if ($direction == 1) {
+    ($from, $to) = ('utf8', 'buckwalter');
+}
+elsif ($direction == 2) {
+    ($from, $to) = ('buckwalter', 'utf8');
+}
+else {
+    die("Unknown direction: $direction");
+}
+
+while (my $line = <STDIN>) {
+    chomp($line);
+    my $lineOut = encode($to, decode($from, $line));
+    print "$lineOut\n";
+}
+
-- 
cgit v1.2.3


From 8f9bf7ea386feb1aef5413730bd627a1161c5928 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Tue, 28 Apr 2015 15:03:59 +0400
Subject: add -config

---
 scripts/training/wrappers/madamira-wrapper.perl | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'scripts')

diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl
index 9866e6421..5c1d0404f 100755
--- a/scripts/training/wrappers/madamira-wrapper.perl
+++ b/scripts/training/wrappers/madamira-wrapper.perl
@@ -15,6 +15,7 @@ my $TMPDIR = "tmp";
 my $SCHEME = "D2";
 my $KEEP_TMP = 0;
 my $MADA_DIR;
+my $CONFIG;
 
 my $FACTORS_STR;
 my @FACTORS;
@@ -24,9 +25,14 @@ GetOptions(
   "tmpdir=s" => \$TMPDIR,
   "keep-tmp" => \$KEEP_TMP,
   "mada-dir=s" => \$MADA_DIR,
-  "factors=s" => \$FACTORS_STR
+  "factors=s" => \$FACTORS_STR,
+  "config=s" => \$CONFIG
     ) or die("ERROR: unknown options");
 
+if (!defined($CONFIG)) {
+  $CONFIG = "$MADA_DIR/samples/sampleConfigFile.xml";
+}
+
 $TMPDIR = abs_path($TMPDIR);
 print STDERR "TMPDIR=$TMPDIR \n";
 
@@ -65,7 +71,7 @@ else {
 $cmd = "$SPLIT_EXEC -l 10000 -a 7 -d  $TMPDIR/input $TMPDIR/split/x";
 `$cmd`;
 
-$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir  $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml  ::: $TMPDIR/split/x*";
+$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir  $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*";
 print STDERR "Executing: $cmd\n";
 `$cmd`;
 
@@ -77,7 +83,7 @@ print STDERR "Executing: $cmd\n";
 open(MADA_OUT,"<$infile.mada");
 #binmode(MADA_OUT, ":utf8");
 while(my $line = <MADA_OUT>) { 
-    chop($line);
+    chomp($line);
   #print STDERR "line=$line \n";
 
     if (index($line, "SENTENCE BREAK") == 0) {
-- 
cgit v1.2.3