From 5536e13213fe518039e08fae30d1a160e457717f Mon Sep 17 00:00:00 2001 From: alvations Date: Fri, 20 Mar 2015 18:44:59 +0100 Subject: added Gacha Filter from WMT14 --- scripts/tokenizer/pre_tokenize_cleaning.py | 78 ++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 scripts/tokenizer/pre_tokenize_cleaning.py (limited to 'scripts') diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py new file mode 100644 index 000000000..76736da5c --- /dev/null +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python -*- coding: utf-8 -*- + +""" +The Gacha filter cleans out sentence pairs that have global character mean +lower than a certain threshold. + +Use this cleaner to produce low quantity of high quality sentence pairs. + +It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during +WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER. +(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf) + +This is inspired by the global character mean that is used in the Gale-Church +algorithm (Gale and Church, 1993), the c variable in: + + delta = (l2-l1*c)/math.sqrt(l1*s2) + +where: + - l1 = len(source_sentence) + - l2 = len(target_sentence) + - c = global mean, i.e. #char in source corpus / #char in target corpus + - s2 = global variance, i.e. 
d ((l1 - l2)^2) / d (l1) + +(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf) +""" + +import io, subprocess + +red = '\033[01;31m' +native = '\033[m' + +def err_msg(txt): + return red+txt+native + +def num_char(filename): + return float(subprocess.Popen(["wc", "-m", filename], + stdout=subprocess.PIPE).stdout.read().split()[0]) + +def gacha_mean(sourcefile, targetfile): + """ + Counts the global character mean between source and target language as + in Gale-Church (1993) + """ + sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n')) + c = num_char(sourcefile) / num_char(targetfile) + sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n')) + sys.stderr.write(err_msg('Filtering starts ...\n')) + return c + +def main(sourcefile, targetfile, threshold=0.2): + # Calculates Gacha mean. + c = gacha_mean(sourcefile, targetfile) + # Calculates lower and upperbound for filtering + threshold = float(threshold) + lowerbound = (1-threshold) * c + upperbound = (1+threshold) * c + + # Start filtering sentences. 
+ with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \ + io.open(targetfile, 'r', encoding='utf8') as trgfin: + for s, t in zip(srcfin, trgfin): + if lowerbound < len(s) / float(len(t)) < upperbound: + print(u"{}\t{}\n".format(s.strip(),t.strip())) + +if __name__ == '__main__': + import sys + if len(sys.argv) not in range(3,5): + usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n' + % sys.argv[0]) + + example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de ' + '~/Europarl.de-en.en 0.4\n' + % sys.argv[0]) + sys.stderr.write(usage_msg) + sys.stderr.write(example_msg) + sys.exit(1) + + main(*sys.argv[1:]) -- cgit v1.2.3 From 40933b4a782d43b9e07a89e62c4c6ffd1b12d1c5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 22 Apr 2015 19:01:12 +0400 Subject: hack to allow target side of tokenized parallel corpus to be used for LM --- scripts/ems/experiment.meta | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'scripts') diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index ead9ebe03..62f85eb1c 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -165,11 +165,18 @@ get-corpus pass-unless: get-corpus-script default-name: lm/txt template: $get-corpus-script > OUT +use-parallel-corpus + in: parallel-corpus-stem + out: tokenized-corpus + default-name: lm/tok + pass-unless: parallel-corpus-stem + template: ln -s IN.$output-extension OUT tokenize in: raw-corpus out: tokenized-corpus default-name: lm/tok pass-unless: output-tokenizer + ignore-if: parallel-corpus-stem template: $output-tokenizer < IN > OUT parallelizable: yes mock-parse -- cgit v1.2.3 From 4b47e1148c7cfe771c8e813cb9d741c2de44ed42 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 22 Apr 2015 23:02:57 +0400 Subject: use ignore-unless /Philipp Koehn --- scripts/ems/experiment.meta | 2 +- scripts/training/train-model.perl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git 
a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 62f85eb1c..57ef4f9d6 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -169,7 +169,7 @@ use-parallel-corpus in: parallel-corpus-stem out: tokenized-corpus default-name: lm/tok - pass-unless: parallel-corpus-stem + ignore-unless: parallel-corpus-stem template: ln -s IN.$output-extension OUT tokenize in: raw-corpus diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index fb63d4bbd..4c355479c 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl use warnings; use strict; -- cgit v1.2.3 From 585784f62aff40b1c81256a55db8fd97408de31c Mon Sep 17 00:00:00 2001 From: alvations Date: Fri, 24 Apr 2015 18:57:28 +0200 Subject: added thread options for filter-model-given-input.pl --- scripts/training/filter-model-given-input.pl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) mode change 100755 => 100644 scripts/training/filter-model-given-input.pl (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl old mode 100755 new mode 100644 index 3ce426c39..d238d7dcb --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -36,6 +36,7 @@ my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; +my $threads = 1; # Default is single-thread my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; my $min_score = undef; my $opt_min_non_initial_rule_count = undef; @@ -53,6 +54,7 @@ GetOptions( "SyntaxFilterCmd=s" => \$syntax_filter_cmd, "tempdir=s" => \$tempdir, "MinScore=s" => \$min_score, + "threads" => \$threads, "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED ) or exit(1); @@ -404,8 +406,8 @@ for(my $i=0;$i<=$#TABLE;$i++) { # ... 
phrase translation model elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table - ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] && rm $mid_file.sorted"; - my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -encoding None"; + ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; + my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { my $cmd = "$binarizer $mid_file $new_file.bin"; @@ -426,7 +428,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { $lexbin =~ s/PhraseTable/LexicalTable/; my $cmd; if ($lexbin =~ /processLexicalTableMin/) { - $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file && rm $mid_file.sorted"; + $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; } else { $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options $cmd = "$lexbin -in $mid_file -out $new_file"; -- cgit v1.2.3 From 6c63ca963c4d6f9455445df59ba51593888ef705 Mon Sep 17 00:00:00 2001 From: alvations Date: Fri, 24 Apr 2015 19:06:37 +0200 Subject: checks for undefined $threads --- scripts/training/filter-model-given-input.pl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index d238d7dcb..2ce1f26a2 100644 --- a/scripts/training/filter-model-given-input.pl +++ 
b/scripts/training/filter-model-given-input.pl @@ -36,7 +36,7 @@ my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; -my $threads = 1; # Default is single-thread +my $threads = undef; # Default is single-thread my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; my $min_score = undef; my $opt_min_non_initial_rule_count = undef; @@ -407,6 +407,9 @@ for(my $i=0;$i<=$#TABLE;$i++) { elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; + if(!defined($threads)) { + $thread = 1 + } my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { @@ -428,6 +431,9 @@ for(my $i=0;$i<=$#TABLE;$i++) { $lexbin =~ s/PhraseTable/LexicalTable/; my $cmd; if ($lexbin =~ /processLexicalTableMin/) { + if(!defined($threads)) { + $thread = 1 + } $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; } else { $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options -- cgit v1.2.3 From aa9207acfc1e8746d3bbb1626472f8906c7c7fa9 Mon Sep 17 00:00:00 2001 From: alvations Date: Fri, 24 Apr 2015 19:10:10 +0200 Subject: fixed typo in $thread to $threads --- scripts/training/filter-model-given-input.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 2ce1f26a2..d015d3762 100644 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -408,7 +408,7 @@ for(my 
$i=0;$i<=$#TABLE;$i++) { #compact phrase table ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; if(!defined($threads)) { - $thread = 1 + $threads = 1 } my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; safesystem($cmd) or die "Can't binarize"; @@ -432,7 +432,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { my $cmd; if ($lexbin =~ /processLexicalTableMin/) { if(!defined($threads)) { - $thread = 1 + $threads = 1 } $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; } else { -- cgit v1.2.3 From 0ccbcaece6133e68782154c9b23c50f2393b10df Mon Sep 17 00:00:00 2001 From: alvations Date: Fri, 24 Apr 2015 19:11:57 +0200 Subject: added $threads option in usage example --- scripts/training/filter-model-given-input-new.pl | 537 +++++++++++++++++++++++ scripts/training/filter-model-given-input.pl | 4 +- 2 files changed, 539 insertions(+), 2 deletions(-) create mode 100644 scripts/training/filter-model-given-input-new.pl (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input-new.pl b/scripts/training/filter-model-given-input-new.pl new file mode 100644 index 000000000..029d83ed0 --- /dev/null +++ b/scripts/training/filter-model-given-input-new.pl @@ -0,0 +1,537 @@ +#!/usr/bin/perl -w + +# $Id$ +# Given a moses.ini file and an input text prepare minimized translation +# tables and a new moses.ini, so that loading of tables is much faster. 
+ +# original code by Philipp Koehn +# changes by Ondrej Bojar +# adapted for hierarchical models by Phil Williams + +use strict; + +use FindBin qw($RealBin); +use Getopt::Long; + +my $SCRIPTS_ROOTDIR; +if (defined($ENV{"SCRIPTS_ROOTDIR"})) { + $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"}; +} else { + $SCRIPTS_ROOTDIR = $RealBin; + if ($SCRIPTS_ROOTDIR eq '') { + $SCRIPTS_ROOTDIR = dirname(__FILE__); + } + $SCRIPTS_ROOTDIR =~ s/\/training$//; + $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR; +} + +# consider phrases in input up to $MAX_LENGTH +# in other words, all phrase-tables will be truncated at least to 10 words per +# phrase. +my $MAX_LENGTH = 10; + +# utilities +my $ZCAT = "gzip -cd"; + +# get optional parameters +my $opt_hierarchical = 0; +my $binarizer = undef; +my $threads = undef; # Default is single-thread +my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; +my $min_score = undef; +my $opt_min_non_initial_rule_count = undef; +my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats) +my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice +my $opt_strip_xml = 1; # disabling XML stripping is required for STSG models where the input is a tree or forest +my $tempdir = undef; + +GetOptions( + "gzip!" => \$opt_gzip, + "filter!" => \$opt_filter, + "Hierarchical" => \$opt_hierarchical, + "Binarizer=s" => \$binarizer, + "StripXml!" 
=> \$opt_strip_xml, + "SyntaxFilterCmd=s" => \$syntax_filter_cmd, + "tempdir=s" => \$tempdir, + "MinScore=s" => \$min_score, + "threads" => \$threads, + "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED +) or exit(1); + +# get command line parameters +my $dir = shift; +my $config = shift; +my $input = shift; + +if (!defined $dir || !defined $config || !defined $input) { + print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n"; + exit 1; +} +$dir = ensure_full_path($dir); + +# Warn if deprecated -MinNonInitialRuleCount option is used +if (defined($opt_min_non_initial_rule_count)) { + print STDERR "WARNING: -MinNonInitialRuleCount is deprecated; use score's -MinCountHierarchical option or set -SyntaxFilterCmd to \"$SCRIPTS_ROOTDIR/training/filter-rule-table.py --min-non-initial-rule=$opt_min_non_initial_rule_count\"\n"; +} + +$tempdir = $dir if !defined $tempdir; # use the working directory as temp by def. + +# decode min-score definitions +my %MIN_SCORE; +if ($min_score) { + foreach (split(/ *, */,$min_score)) { + my ($id,$score) = split(/ *: */); + $MIN_SCORE{$id} = $score; + print STDERR "score $id must be at least $score\n"; + } +} +# buggy directory in place? +if (-d $dir && ! -e "$dir/info") { + print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n"; + exit(1); +} + +# already filtered? 
check if it can be re-used +if (-d $dir) { + my @INFO = `cat $dir/info`; + chop(@INFO); + if($INFO[0] ne $config + || ($INFO[1] ne $input && + $INFO[1].".tagged" ne $input)) { + print STDERR "WARNING: directory exists but does not match parameters:\n"; + print STDERR " ($INFO[0] ne $config || $INFO[1] ne $input)\n"; + exit 1; + } + print STDERR "The filtered model was ready in $dir, not doing anything.\n"; + exit 0; +} + +# filter the translation and distortion tables +safesystem("mkdir -p $dir") or die "Can't mkdir $dir"; + +my $cmd; +if ($opt_strip_xml) { + my $inputStrippedXML = "$dir/input.$$"; + $cmd = "$RealBin/../generic/strip-xml.perl < $input > $inputStrippedXML"; + print STDERR "Stripping XML...\n"; + safesystem($cmd) or die "Can't strip XML"; + $input = $inputStrippedXML; +} + +# get tables to be filtered (and modify config file) +my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER); + +my %new_name_used = (); +open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini"; +open(INI,$config) or die "Can't read $config"; +while(my $line = ) { + chomp($line); + my @toks = split(/ /, $line); + if ($line =~ /PhraseDictionaryMemory / + || $line =~ /PhraseDictionaryBinary / + || $line =~ /PhraseDictionaryOnDisk / + || $line =~ /PhraseDictionarySCFG / + || $line =~ /RuleTable / + ) { + print STDERR "pt:$line\n"; + + my ($phrase_table_impl,$source_factor,$t,$w,$file,$table_flag,$skip); # = ($1,$2,$3,$4,$5,$6,$7); + $table_flag = ""; + $phrase_table_impl = $toks[0]; + $skip = 0; + + for (my $i = 1; $i < scalar(@toks); ++$i) { + my @args = split(/=/, $toks[$i]); + chomp($args[0]); + chomp($args[1]); + + if ($args[0] eq "num-features") { + $w = $args[1]; + } + elsif ($args[0] eq "input-factor") { + $source_factor = $args[1]; + } + elsif ($args[0] eq "output-factor") { + $t = $args[1]; + } + elsif ($args[0] eq "path") { + $file = $args[1]; + } + elsif ($args[0] eq "filterable" && $args[1] eq "false") { + $skip 
= 1; + } + } #for (my $i = 1; $i < scalar(@toks); ++$i) { + + if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG" && $phrase_table_impl ne "RuleTable") || $file =~ /glue-grammar/ || $skip) { + # Only Memory ("0") and NewFormat ("6") can be filtered. + print INI_OUT "$line\n"; + next; + } + + push @TABLE, $file; + push @TABLE_WEIGHTS,$w; + $KNOWN_TTABLE{$#TABLE}++; + + my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"}); + my $cnt = 1; + $cnt ++ while (defined $new_name_used{"$new_name.$cnt"}); + $new_name .= ".$cnt"; + $new_name_used{$new_name} = 1; + if ($binarizer && $phrase_table_impl eq "PhraseDictionarySCFG") { + $phrase_table_impl = "PhraseDictionaryOnDisk"; + @toks = set_value(\@toks, "path", "$new_name.bin$table_flag"); + } + elsif ($binarizer && $phrase_table_impl eq "PhraseDictionaryMemory") { + if ($binarizer =~ /processPhraseTableMin/) { + $phrase_table_impl = "PhraseDictionaryCompact"; + @toks = set_value(\@toks, "path", "$new_name$table_flag"); + } + elsif ($binarizer =~ /CreateOnDiskPt/) { + $phrase_table_impl = "PhraseDictionaryOnDisk"; + @toks = set_value(\@toks, "path", "$new_name.bin$table_flag"); + } + else { + $phrase_table_impl = "PhraseDictionaryBinary"; + @toks = set_value(\@toks, "path", "$new_name$table_flag"); + } + } + else { + $new_name .= ".gz" if $opt_gzip; + @toks = set_value(\@toks, "path", "$new_name$table_flag"); + } + + $toks[0] = $phrase_table_impl; + + print INI_OUT join_array(\@toks)."\n"; + + push @TABLE_NEW_NAME,$new_name; + + $CONSIDER_FACTORS{$source_factor} = 1; + print STDERR "Considering factor $source_factor\n"; + push @TABLE_FACTORS, $source_factor; + + } #if (/PhraseModel /) { + elsif ($line =~ /LexicalReordering /) { + print STDERR "ro:$line\n"; + my ($source_factor, $t, $w, $file); # = ($1,$2,$3,$4); + + for (my $i = 1; $i < scalar(@toks); ++$i) { + my @args = split(/=/, $toks[$i]); + chomp($args[0]); + chomp($args[1]); + + if 
($args[0] eq "num-features") { + $w = $args[1]; + } + elsif ($args[0] eq "input-factor") { + $source_factor = chomp($args[1]); + } + elsif ($args[0] eq "output-factor") { + #$t = chomp($args[1]); + } + elsif ($args[0] eq "type") { + $t = $args[1]; + } + elsif ($args[0] eq "path") { + $file = $args[1]; + } + + } # for (my $i = 1; $i < scalar(@toks); ++$i) { + + push @TABLE, $file; + push @TABLE_WEIGHTS,$w; + + $file =~ s/^.*\/+([^\/]+)/$1/g; + my $new_name = "$dir/$file"; + $new_name =~ s/\.gz//; + + #print INI_OUT "$source_factor $t $w $new_name\n"; + @toks = set_value(\@toks, "path", "$new_name"); + print INI_OUT join_array(\@toks)."\n"; + + push @TABLE_NEW_NAME,$new_name; + + $CONSIDER_FACTORS{$source_factor} = 1; + print STDERR "Considering factor $source_factor\n"; + push @TABLE_FACTORS,$source_factor; + + + } #elsif (/LexicalReordering /) { + else { + print INI_OUT "$line\n"; + } +} # while() { +close(INI); +close(INI_OUT); + +my %TMP_INPUT_FILENAME; + +if ($opt_hierarchical) { + if (!$opt_strip_xml) { + print STDERR "WARNING: source factor reduction is disabled due to use of -noStripXML option\n"; + } else { + # Write a separate, temporary input file for each combination of source + # factors + foreach my $key (keys %CONSIDER_FACTORS) { + my $filename = "$dir/input-$key"; + open(FILEHANDLE,">$filename") or die "Can't open $filename for writing"; + $TMP_INPUT_FILENAME{$key} = $filename; + my @FACTOR = split(/,/, $key); + my $cmd = "$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |"; + print STDERR "Executing: $cmd\n"; + open(PIPE,$cmd); + while (my $line = ) { + print FILEHANDLE $line + } + close(FILEHANDLE); + } + } +} + +my %PHRASE_USED; +if ($opt_filter && !$opt_hierarchical) { + # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH + open(INPUT,mk_open_string($input)) or die "Can't read $input"; + while(my $line = ) { + chomp($line); + my @WORD = split(/ +/,$line); + for(my $i=0;$i<=$#WORD;$i++) { + for(my $j=0;$j<$MAX_LENGTH 
&& $j+$i<=$#WORD;$j++) { + foreach (keys %CONSIDER_FACTORS) { + my @FACTOR = split(/,/); + my $phrase = ""; + for(my $k=$i;$k<=$i+$j;$k++) { + my @WORD_FACTOR = split(/\|/,$WORD[$k]); + for(my $f=0;$f<=$#FACTOR;$f++) { + $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|"; + } + chop($phrase); + $phrase .= " "; + } + chop($phrase); + $PHRASE_USED{$_}{$phrase}++; + } + } + } + } + close(INPUT); +} + +# filter files +print STDERR "Filtering files...\n"; +for(my $i=0;$i<=$#TABLE;$i++) { + my ($used,$total) = (0,0); + my $file = $TABLE[$i]; + my $factors = $TABLE_FACTORS[$i]; + my $new_file = $TABLE_NEW_NAME[$i]; + print STDERR "filtering $file -> $new_file...\n"; + my $mid_file = $new_file; # used when both filtering and binarizing + if (!$opt_filter) { + # check if original file was gzipped + if ($file !~ /\.gz$/ && -e "$file.gz") { + $file .= ".gz"; + } + $mid_file .= ".gz" if $file =~ /\.gz$/; + $cmd = "ln -s $file $mid_file"; + safesystem($cmd) or die "Failed to make symlink"; + } else { + + $mid_file .= ".gz" + if $mid_file !~ /\.gz/ + && $binarizer && $binarizer =~ /processPhraseTable/; + + my $openstring = mk_open_string($file); + + my $mid_openstring; + if ($mid_file =~ /\.gz$/) { + $mid_openstring = "| gzip -c > $mid_file"; + } else { + $mid_openstring = ">$mid_file"; + } + + + open(FILE_OUT,$mid_openstring) or die "Can't write to $mid_openstring"; + + if ($opt_hierarchical) { + my $input_file = $opt_strip_xml ? 
$TMP_INPUT_FILENAME{$factors} : $input; + $cmd = "$openstring $syntax_filter_cmd $input_file |"; + print STDERR "Executing: $cmd\n"; + open(PIPE,$cmd); + while (my $line = ) { + print FILE_OUT $line + } + close(FILEHANDLE); + } else { + open(FILE,$openstring) or die "Can't open '$openstring'"; + while(my $entry = ) { + my ($foreign,$rest) = split(/ \|\|\| /,$entry,2); + $foreign =~ s/ $//; + if (defined($PHRASE_USED{$factors}{$foreign})) { + # handle min_score thresholds + if ($min_score) { + my @ITEM = split(/ *\|\|\| */,$rest); + if(scalar (@ITEM)>2) { # do not filter reordering table + my @SCORE = split(/ /,$ITEM[1]); + my $okay = 1; + foreach my $id (keys %MIN_SCORE) { + $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id}; + } + next unless $okay; + } + } + print FILE_OUT $entry; + $used++; + } + $total++; + } + close(FILE); + die "No phrases found in $file!" if $total == 0; + printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%'; + } + + close(FILE_OUT); + + } + + my $catcmd = ($mid_file =~ /\.gz$/ ? "$ZCAT" : "cat"); + if(defined($binarizer)) { + print STDERR "binarizing...\n"; + # translation model + if ($KNOWN_TTABLE{$i}) { + # ... hierarchical translation model + if ($opt_hierarchical) { + my $cmd = "$binarizer $mid_file $new_file.bin"; + safesystem($cmd) or die "Can't binarize"; + } + # ... 
phrase translation model + elsif ($binarizer =~ /processPhraseTableMin/) { + #compact phrase table + ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; + my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; + safesystem($cmd) or die "Can't binarize"; + } elsif ($binarizer =~ /CreateOnDiskPt/) { + my $cmd = "$binarizer $mid_file $new_file.bin"; + safesystem($cmd) or die "Can't binarize"; + } else { + my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file"; + safesystem($cmd) or die "Can't binarize"; + } + } + # reordering model + else { + my $lexbin; + $lexbin = $binarizer; + if ($binarizer =~ /CreateOnDiskPt/) { + $lexbin =~ s/CreateOnDiskPt/processLexicalTable/; + } + + $lexbin =~ s/PhraseTable/LexicalTable/; + my $cmd; + if ($lexbin =~ /processLexicalTableMin/) { + $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; + } else { + $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options + $cmd = "$lexbin -in $mid_file -out $new_file"; + } + safesystem($cmd) or die "Can't binarize"; + } + } +} + +# Remove any temporary input files +unlink values %TMP_INPUT_FILENAME; + +open(INFO,">$dir/info"); +print INFO "$config\n$input\n"; +close(INFO); + + +print "To run the decoder, please call: + moses -f $dir/moses.ini -i $input\n"; + +# functions +sub mk_open_string { + my $file = shift; + my $openstring; + if ($file !~ /\.gz$/ && -e "$file.gz") { + $openstring = "$ZCAT $file.gz |"; + } elsif ($file =~ /\.gz$/) { + $openstring = "$ZCAT $file |"; + } elsif ($opt_hierarchical) { + $openstring = "cat $file |"; + } else { + $openstring = "< $file"; + } + return 
$openstring; +} + + +sub safesystem { + print STDERR "Executing: @_\n"; + system("bash", "-c", @_); + if ($? == -1) { + print STDERR "Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ($? & 127) { + printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! $exitcode; + } +} + +sub ensure_full_path { + my $PATH = shift; + return $PATH if $PATH =~ /^\//; + my $dir = `pawd 2>/dev/null`; + if (!$dir) {$dir = `pwd`;} + chomp $dir; + $PATH = $dir."/".$PATH; + $PATH =~ s/[\r\n]//g; + $PATH =~ s/\/\.\//\//g; + $PATH =~ s/\/+/\//g; + my $sanity = 0; + while($PATH =~ /\/\.\.\// && $sanity++<10) { + $PATH =~ s/\/+/\//g; + $PATH =~ s/\/[^\/]+\/\.\.\//\//g; + } + $PATH =~ s/\/[^\/]+\/\.\.$//; + $PATH =~ s/\/+$//; + return $PATH; +} + +sub join_array { + my @outside = @{$_[0]}; + + my $ret = ""; + for (my $i = 0; $i < scalar(@outside); ++$i) { + my $tok = $outside[$i]; + $ret .= "$tok "; + } + + return $ret; +} + +sub set_value { + my @arr = @{$_[0]}; + my $keySought = $_[1]; + my $newValue = $_[2]; + + for (my $i = 1; $i < scalar(@arr); ++$i) { + my @inside = split(/=/, $arr[$i]); + + my $key = $inside[0]; + if ($key eq $keySought) { + $arr[$i] = "$key=$newValue"; + return @arr; + } + } + return @arr; +} + + diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index d015d3762..9373e44c1 100644 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -36,7 +36,7 @@ my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; -my $threads = undef; # Default is single-thread +my $threads = undef; # Default is single-thread, i.e. 
$threads=1 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; my $min_score = undef; my $opt_min_non_initial_rule_count = undef; @@ -64,7 +64,7 @@ my $config = shift; my $input = shift; if (!defined $dir || !defined $config || !defined $input) { - print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n"; + print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n"; exit 1; } $dir = ensure_full_path($dir); -- cgit v1.2.3 From d453ccc9f5ac88fb1b346f622dd76512077181c7 Mon Sep 17 00:00:00 2001 From: alvations Date: Fri, 24 Apr 2015 19:14:04 +0200 Subject: removed wrongly added perl script... --- scripts/training/filter-model-given-input-new.pl | 537 ----------------------- 1 file changed, 537 deletions(-) delete mode 100644 scripts/training/filter-model-given-input-new.pl (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input-new.pl b/scripts/training/filter-model-given-input-new.pl deleted file mode 100644 index 029d83ed0..000000000 --- a/scripts/training/filter-model-given-input-new.pl +++ /dev/null @@ -1,537 +0,0 @@ -#!/usr/bin/perl -w - -# $Id$ -# Given a moses.ini file and an input text prepare minimized translation -# tables and a new moses.ini, so that loading of tables is much faster. 
- -# original code by Philipp Koehn -# changes by Ondrej Bojar -# adapted for hierarchical models by Phil Williams - -use strict; - -use FindBin qw($RealBin); -use Getopt::Long; - -my $SCRIPTS_ROOTDIR; -if (defined($ENV{"SCRIPTS_ROOTDIR"})) { - $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"}; -} else { - $SCRIPTS_ROOTDIR = $RealBin; - if ($SCRIPTS_ROOTDIR eq '') { - $SCRIPTS_ROOTDIR = dirname(__FILE__); - } - $SCRIPTS_ROOTDIR =~ s/\/training$//; - $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR; -} - -# consider phrases in input up to $MAX_LENGTH -# in other words, all phrase-tables will be truncated at least to 10 words per -# phrase. -my $MAX_LENGTH = 10; - -# utilities -my $ZCAT = "gzip -cd"; - -# get optional parameters -my $opt_hierarchical = 0; -my $binarizer = undef; -my $threads = undef; # Default is single-thread -my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; -my $min_score = undef; -my $opt_min_non_initial_rule_count = undef; -my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats) -my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice -my $opt_strip_xml = 1; # disabling XML stripping is required for STSG models where the input is a tree or forest -my $tempdir = undef; - -GetOptions( - "gzip!" => \$opt_gzip, - "filter!" => \$opt_filter, - "Hierarchical" => \$opt_hierarchical, - "Binarizer=s" => \$binarizer, - "StripXml!" 
=> \$opt_strip_xml, - "SyntaxFilterCmd=s" => \$syntax_filter_cmd, - "tempdir=s" => \$tempdir, - "MinScore=s" => \$min_score, - "threads" => \$threads, - "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED -) or exit(1); - -# get command line parameters -my $dir = shift; -my $config = shift; -my $input = shift; - -if (!defined $dir || !defined $config || !defined $input) { - print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n"; - exit 1; -} -$dir = ensure_full_path($dir); - -# Warn if deprecated -MinNonInitialRuleCount option is used -if (defined($opt_min_non_initial_rule_count)) { - print STDERR "WARNING: -MinNonInitialRuleCount is deprecated; use score's -MinCountHierarchical option or set -SyntaxFilterCmd to \"$SCRIPTS_ROOTDIR/training/filter-rule-table.py --min-non-initial-rule=$opt_min_non_initial_rule_count\"\n"; -} - -$tempdir = $dir if !defined $tempdir; # use the working directory as temp by def. - -# decode min-score definitions -my %MIN_SCORE; -if ($min_score) { - foreach (split(/ *, */,$min_score)) { - my ($id,$score) = split(/ *: */); - $MIN_SCORE{$id} = $score; - print STDERR "score $id must be at least $score\n"; - } -} -# buggy directory in place? -if (-d $dir && ! -e "$dir/info") { - print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n"; - exit(1); -} - -# already filtered? 
check if it can be re-used -if (-d $dir) { - my @INFO = `cat $dir/info`; - chop(@INFO); - if($INFO[0] ne $config - || ($INFO[1] ne $input && - $INFO[1].".tagged" ne $input)) { - print STDERR "WARNING: directory exists but does not match parameters:\n"; - print STDERR " ($INFO[0] ne $config || $INFO[1] ne $input)\n"; - exit 1; - } - print STDERR "The filtered model was ready in $dir, not doing anything.\n"; - exit 0; -} - -# filter the translation and distortion tables -safesystem("mkdir -p $dir") or die "Can't mkdir $dir"; - -my $cmd; -if ($opt_strip_xml) { - my $inputStrippedXML = "$dir/input.$$"; - $cmd = "$RealBin/../generic/strip-xml.perl < $input > $inputStrippedXML"; - print STDERR "Stripping XML...\n"; - safesystem($cmd) or die "Can't strip XML"; - $input = $inputStrippedXML; -} - -# get tables to be filtered (and modify config file) -my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER); - -my %new_name_used = (); -open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini"; -open(INI,$config) or die "Can't read $config"; -while(my $line = ) { - chomp($line); - my @toks = split(/ /, $line); - if ($line =~ /PhraseDictionaryMemory / - || $line =~ /PhraseDictionaryBinary / - || $line =~ /PhraseDictionaryOnDisk / - || $line =~ /PhraseDictionarySCFG / - || $line =~ /RuleTable / - ) { - print STDERR "pt:$line\n"; - - my ($phrase_table_impl,$source_factor,$t,$w,$file,$table_flag,$skip); # = ($1,$2,$3,$4,$5,$6,$7); - $table_flag = ""; - $phrase_table_impl = $toks[0]; - $skip = 0; - - for (my $i = 1; $i < scalar(@toks); ++$i) { - my @args = split(/=/, $toks[$i]); - chomp($args[0]); - chomp($args[1]); - - if ($args[0] eq "num-features") { - $w = $args[1]; - } - elsif ($args[0] eq "input-factor") { - $source_factor = $args[1]; - } - elsif ($args[0] eq "output-factor") { - $t = $args[1]; - } - elsif ($args[0] eq "path") { - $file = $args[1]; - } - elsif ($args[0] eq "filterable" && $args[1] eq "false") { - $skip 
= 1; - } - } #for (my $i = 1; $i < scalar(@toks); ++$i) { - - if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG" && $phrase_table_impl ne "RuleTable") || $file =~ /glue-grammar/ || $skip) { - # Only Memory ("0") and NewFormat ("6") can be filtered. - print INI_OUT "$line\n"; - next; - } - - push @TABLE, $file; - push @TABLE_WEIGHTS,$w; - $KNOWN_TTABLE{$#TABLE}++; - - my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"}); - my $cnt = 1; - $cnt ++ while (defined $new_name_used{"$new_name.$cnt"}); - $new_name .= ".$cnt"; - $new_name_used{$new_name} = 1; - if ($binarizer && $phrase_table_impl eq "PhraseDictionarySCFG") { - $phrase_table_impl = "PhraseDictionaryOnDisk"; - @toks = set_value(\@toks, "path", "$new_name.bin$table_flag"); - } - elsif ($binarizer && $phrase_table_impl eq "PhraseDictionaryMemory") { - if ($binarizer =~ /processPhraseTableMin/) { - $phrase_table_impl = "PhraseDictionaryCompact"; - @toks = set_value(\@toks, "path", "$new_name$table_flag"); - } - elsif ($binarizer =~ /CreateOnDiskPt/) { - $phrase_table_impl = "PhraseDictionaryOnDisk"; - @toks = set_value(\@toks, "path", "$new_name.bin$table_flag"); - } - else { - $phrase_table_impl = "PhraseDictionaryBinary"; - @toks = set_value(\@toks, "path", "$new_name$table_flag"); - } - } - else { - $new_name .= ".gz" if $opt_gzip; - @toks = set_value(\@toks, "path", "$new_name$table_flag"); - } - - $toks[0] = $phrase_table_impl; - - print INI_OUT join_array(\@toks)."\n"; - - push @TABLE_NEW_NAME,$new_name; - - $CONSIDER_FACTORS{$source_factor} = 1; - print STDERR "Considering factor $source_factor\n"; - push @TABLE_FACTORS, $source_factor; - - } #if (/PhraseModel /) { - elsif ($line =~ /LexicalReordering /) { - print STDERR "ro:$line\n"; - my ($source_factor, $t, $w, $file); # = ($1,$2,$3,$4); - - for (my $i = 1; $i < scalar(@toks); ++$i) { - my @args = split(/=/, $toks[$i]); - chomp($args[0]); - chomp($args[1]); - - if 
($args[0] eq "num-features") { - $w = $args[1]; - } - elsif ($args[0] eq "input-factor") { - $source_factor = chomp($args[1]); - } - elsif ($args[0] eq "output-factor") { - #$t = chomp($args[1]); - } - elsif ($args[0] eq "type") { - $t = $args[1]; - } - elsif ($args[0] eq "path") { - $file = $args[1]; - } - - } # for (my $i = 1; $i < scalar(@toks); ++$i) { - - push @TABLE, $file; - push @TABLE_WEIGHTS,$w; - - $file =~ s/^.*\/+([^\/]+)/$1/g; - my $new_name = "$dir/$file"; - $new_name =~ s/\.gz//; - - #print INI_OUT "$source_factor $t $w $new_name\n"; - @toks = set_value(\@toks, "path", "$new_name"); - print INI_OUT join_array(\@toks)."\n"; - - push @TABLE_NEW_NAME,$new_name; - - $CONSIDER_FACTORS{$source_factor} = 1; - print STDERR "Considering factor $source_factor\n"; - push @TABLE_FACTORS,$source_factor; - - - } #elsif (/LexicalReordering /) { - else { - print INI_OUT "$line\n"; - } -} # while() { -close(INI); -close(INI_OUT); - -my %TMP_INPUT_FILENAME; - -if ($opt_hierarchical) { - if (!$opt_strip_xml) { - print STDERR "WARNING: source factor reduction is disabled due to use of -noStripXML option\n"; - } else { - # Write a separate, temporary input file for each combination of source - # factors - foreach my $key (keys %CONSIDER_FACTORS) { - my $filename = "$dir/input-$key"; - open(FILEHANDLE,">$filename") or die "Can't open $filename for writing"; - $TMP_INPUT_FILENAME{$key} = $filename; - my @FACTOR = split(/,/, $key); - my $cmd = "$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |"; - print STDERR "Executing: $cmd\n"; - open(PIPE,$cmd); - while (my $line = ) { - print FILEHANDLE $line - } - close(FILEHANDLE); - } - } -} - -my %PHRASE_USED; -if ($opt_filter && !$opt_hierarchical) { - # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH - open(INPUT,mk_open_string($input)) or die "Can't read $input"; - while(my $line = ) { - chomp($line); - my @WORD = split(/ +/,$line); - for(my $i=0;$i<=$#WORD;$i++) { - for(my $j=0;$j<$MAX_LENGTH 
&& $j+$i<=$#WORD;$j++) { - foreach (keys %CONSIDER_FACTORS) { - my @FACTOR = split(/,/); - my $phrase = ""; - for(my $k=$i;$k<=$i+$j;$k++) { - my @WORD_FACTOR = split(/\|/,$WORD[$k]); - for(my $f=0;$f<=$#FACTOR;$f++) { - $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|"; - } - chop($phrase); - $phrase .= " "; - } - chop($phrase); - $PHRASE_USED{$_}{$phrase}++; - } - } - } - } - close(INPUT); -} - -# filter files -print STDERR "Filtering files...\n"; -for(my $i=0;$i<=$#TABLE;$i++) { - my ($used,$total) = (0,0); - my $file = $TABLE[$i]; - my $factors = $TABLE_FACTORS[$i]; - my $new_file = $TABLE_NEW_NAME[$i]; - print STDERR "filtering $file -> $new_file...\n"; - my $mid_file = $new_file; # used when both filtering and binarizing - if (!$opt_filter) { - # check if original file was gzipped - if ($file !~ /\.gz$/ && -e "$file.gz") { - $file .= ".gz"; - } - $mid_file .= ".gz" if $file =~ /\.gz$/; - $cmd = "ln -s $file $mid_file"; - safesystem($cmd) or die "Failed to make symlink"; - } else { - - $mid_file .= ".gz" - if $mid_file !~ /\.gz/ - && $binarizer && $binarizer =~ /processPhraseTable/; - - my $openstring = mk_open_string($file); - - my $mid_openstring; - if ($mid_file =~ /\.gz$/) { - $mid_openstring = "| gzip -c > $mid_file"; - } else { - $mid_openstring = ">$mid_file"; - } - - - open(FILE_OUT,$mid_openstring) or die "Can't write to $mid_openstring"; - - if ($opt_hierarchical) { - my $input_file = $opt_strip_xml ? 
$TMP_INPUT_FILENAME{$factors} : $input; - $cmd = "$openstring $syntax_filter_cmd $input_file |"; - print STDERR "Executing: $cmd\n"; - open(PIPE,$cmd); - while (my $line = ) { - print FILE_OUT $line - } - close(FILEHANDLE); - } else { - open(FILE,$openstring) or die "Can't open '$openstring'"; - while(my $entry = ) { - my ($foreign,$rest) = split(/ \|\|\| /,$entry,2); - $foreign =~ s/ $//; - if (defined($PHRASE_USED{$factors}{$foreign})) { - # handle min_score thresholds - if ($min_score) { - my @ITEM = split(/ *\|\|\| */,$rest); - if(scalar (@ITEM)>2) { # do not filter reordering table - my @SCORE = split(/ /,$ITEM[1]); - my $okay = 1; - foreach my $id (keys %MIN_SCORE) { - $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id}; - } - next unless $okay; - } - } - print FILE_OUT $entry; - $used++; - } - $total++; - } - close(FILE); - die "No phrases found in $file!" if $total == 0; - printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%'; - } - - close(FILE_OUT); - - } - - my $catcmd = ($mid_file =~ /\.gz$/ ? "$ZCAT" : "cat"); - if(defined($binarizer)) { - print STDERR "binarizing...\n"; - # translation model - if ($KNOWN_TTABLE{$i}) { - # ... hierarchical translation model - if ($opt_hierarchical) { - my $cmd = "$binarizer $mid_file $new_file.bin"; - safesystem($cmd) or die "Can't binarize"; - } - # ... 
phrase translation model - elsif ($binarizer =~ /processPhraseTableMin/) { - #compact phrase table - ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; - my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; - safesystem($cmd) or die "Can't binarize"; - } elsif ($binarizer =~ /CreateOnDiskPt/) { - my $cmd = "$binarizer $mid_file $new_file.bin"; - safesystem($cmd) or die "Can't binarize"; - } else { - my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file"; - safesystem($cmd) or die "Can't binarize"; - } - } - # reordering model - else { - my $lexbin; - $lexbin = $binarizer; - if ($binarizer =~ /CreateOnDiskPt/) { - $lexbin =~ s/CreateOnDiskPt/processLexicalTable/; - } - - $lexbin =~ s/PhraseTable/LexicalTable/; - my $cmd; - if ($lexbin =~ /processLexicalTableMin/) { - $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; - } else { - $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options - $cmd = "$lexbin -in $mid_file -out $new_file"; - } - safesystem($cmd) or die "Can't binarize"; - } - } -} - -# Remove any temporary input files -unlink values %TMP_INPUT_FILENAME; - -open(INFO,">$dir/info"); -print INFO "$config\n$input\n"; -close(INFO); - - -print "To run the decoder, please call: - moses -f $dir/moses.ini -i $input\n"; - -# functions -sub mk_open_string { - my $file = shift; - my $openstring; - if ($file !~ /\.gz$/ && -e "$file.gz") { - $openstring = "$ZCAT $file.gz |"; - } elsif ($file =~ /\.gz$/) { - $openstring = "$ZCAT $file |"; - } elsif ($opt_hierarchical) { - $openstring = "cat $file |"; - } else { - $openstring = "< $file"; - } - return 
$openstring; -} - - -sub safesystem { - print STDERR "Executing: @_\n"; - system("bash", "-c", @_); - if ($? == -1) { - print STDERR "Failed to execute: @_\n $!\n"; - exit(1); - } - elsif ($? & 127) { - printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - exit(1); - } - else { - my $exitcode = $? >> 8; - print STDERR "Exit code: $exitcode\n" if $exitcode; - return ! $exitcode; - } -} - -sub ensure_full_path { - my $PATH = shift; - return $PATH if $PATH =~ /^\//; - my $dir = `pawd 2>/dev/null`; - if (!$dir) {$dir = `pwd`;} - chomp $dir; - $PATH = $dir."/".$PATH; - $PATH =~ s/[\r\n]//g; - $PATH =~ s/\/\.\//\//g; - $PATH =~ s/\/+/\//g; - my $sanity = 0; - while($PATH =~ /\/\.\.\// && $sanity++<10) { - $PATH =~ s/\/+/\//g; - $PATH =~ s/\/[^\/]+\/\.\.\//\//g; - } - $PATH =~ s/\/[^\/]+\/\.\.$//; - $PATH =~ s/\/+$//; - return $PATH; -} - -sub join_array { - my @outside = @{$_[0]}; - - my $ret = ""; - for (my $i = 0; $i < scalar(@outside); ++$i) { - my $tok = $outside[$i]; - $ret .= "$tok "; - } - - return $ret; -} - -sub set_value { - my @arr = @{$_[0]}; - my $keySought = $_[1]; - my $newValue = $_[2]; - - for (my $i = 1; $i < scalar(@arr); ++$i) { - my @inside = split(/=/, $arr[$i]); - - my $key = $inside[0]; - if ($key eq $keySought) { - $arr[$i] = "$key=$newValue"; - return @arr; - } - } - return @arr; -} - - -- cgit v1.2.3 From e1fcc8082a29762946b92e1b068551481fc2b743 Mon Sep 17 00:00:00 2001 From: alvations Date: Fri, 24 Apr 2015 19:30:40 +0200 Subject: use integer type when reading options instead of checking for undef. it's more elegant. 
--- scripts/training/filter-model-given-input.pl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 9373e44c1..84dbbe879 100644 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -36,7 +36,7 @@ my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; -my $threads = undef; # Default is single-thread, i.e. $threads=1 +my $threads = 1; # Default is single-thread, i.e. $threads=1 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; my $min_score = undef; my $opt_min_non_initial_rule_count = undef; @@ -54,7 +54,7 @@ GetOptions( "SyntaxFilterCmd=s" => \$syntax_filter_cmd, "tempdir=s" => \$tempdir, "MinScore=s" => \$min_score, - "threads" => \$threads, + "threads=i" => \$threads, "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED ) or exit(1); @@ -407,9 +407,6 @@ for(my $i=0;$i<=$#TABLE;$i++) { elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; - if(!defined($threads)) { - $threads = 1 - } my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { @@ -431,9 +428,6 @@ for(my $i=0;$i<=$#TABLE;$i++) { $lexbin =~ s/PhraseTable/LexicalTable/; my $cmd; if ($lexbin =~ /processLexicalTableMin/) { - if(!defined($threads)) { - $threads = 1 - } $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm 
$mid_file.sorted"; } else { $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options -- cgit v1.2.3 From c01b0a6262fbf92e1908ebc31c5f0894c489a2cf Mon Sep 17 00:00:00 2001 From: alvations Date: Sun, 26 Apr 2015 20:25:15 +0200 Subject: merging the filter-model-given-input.pl with alvations-master branch --- scripts/training/filter-model-given-input.pl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 7dec0762c..84dbbe879 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/perl -w # $Id$ # Given a moses.ini file and an input text prepare minimized translation @@ -8,7 +8,6 @@ # changes by Ondrej Bojar # adapted for hierarchical models by Phil Williams -use warnings; use strict; use FindBin qw($RealBin); @@ -37,6 +36,7 @@ my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; +my $threads = 1; # Default is single-thread, i.e. 
$threads=1 my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; my $min_score = undef; my $opt_min_non_initial_rule_count = undef; @@ -54,6 +54,7 @@ GetOptions( "SyntaxFilterCmd=s" => \$syntax_filter_cmd, "tempdir=s" => \$tempdir, "MinScore=s" => \$min_score, + "threads=i" => \$threads, "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED ) or exit(1); @@ -63,7 +64,7 @@ my $config = shift; my $input = shift; if (!defined $dir || !defined $config || !defined $input) { - print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd]\n"; + print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n"; exit 1; } $dir = ensure_full_path($dir); @@ -405,7 +406,8 @@ for(my $i=0;$i<=$#TABLE;$i++) { # ... 
phrase translation model elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table - my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] && rm $mid_file.sorted"; + ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; + my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { my $cmd = "$binarizer $mid_file $new_file.bin"; @@ -426,7 +428,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { $lexbin =~ s/PhraseTable/LexicalTable/; my $cmd; if ($lexbin =~ /processLexicalTableMin/) { - $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file && rm $mid_file.sorted"; + $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; } else { $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options $cmd = "$lexbin -in $mid_file -out $new_file"; -- cgit v1.2.3 From ec54ea3c4fcdb055661dba1fe3003d6bb1a0bed8 Mon Sep 17 00:00:00 2001 From: alvations Date: Sun, 26 Apr 2015 20:30:15 +0200 Subject: put back some of the difference made after RELEASE3.0 and incorporated it with the -threads parameter --- scripts/training/filter-model-given-input.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 84dbbe879..1464fdb73 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env 
perl # $Id$ # Given a moses.ini file and an input text prepare minimized translation @@ -8,6 +8,7 @@ # changes by Ondrej Bojar # adapted for hierarchical models by Phil Williams +use warnings; use strict; use FindBin qw($RealBin); @@ -406,8 +407,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { # ... phrase translation model elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table - ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; - my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; + my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { my $cmd = "$binarizer $mid_file $new_file.bin"; -- cgit v1.2.3 From 4a68c42b16626e2ee707e93a6453eda51dc807a1 Mon Sep 17 00:00:00 2001 From: alvations Date: Sun, 26 Apr 2015 20:37:10 +0200 Subject: syncing to latest moses version --- scripts/training/filter-model-given-input.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 84dbbe879..1464fdb73 100644 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ # Given a moses.ini file and an input text prepare minimized translation @@ -8,6 +8,7 @@ # changes by Ondrej Bojar # adapted for hierarchical models by Phil Williams +use warnings; use strict; use FindBin qw($RealBin); @@ -406,8 +407,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { # ... 
phrase translation model elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table - ##my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; - my $cmd = "$binarizer -in <($catcmd $mid_file | LC_ALL=C sort -T $tempdir) -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads -encoding None"; + my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { my $cmd = "$binarizer $mid_file $new_file.bin"; -- cgit v1.2.3 From da648fd65b7de9709b5cd3b094138f68a9584de0 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Mon, 27 Apr 2015 10:52:16 +0100 Subject: fix some RDLM training options --- scripts/training/rdlm/README | 4 ++-- scripts/training/rdlm/extract_syntactic_ngrams.py | 5 +++-- scripts/training/rdlm/extract_vocab.py | 4 ---- scripts/training/rdlm/train_rdlm.py | 17 +++++++++-------- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'scripts') diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README index 209daf1c0..ca2a06015 100644 --- a/scripts/training/rdlm/README +++ b/scripts/training/rdlm/README @@ -31,8 +31,8 @@ RDLM is split into two neural network models, which can be trained with mkdir working_dir_head mkdir working_dir_label - ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise-samples 100 - ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise-samples 50 + ./train_rdlm.py --nplm-home 
/path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise 100 + ./train_rdlm.py --nplm-home /path/to/nplm --corpus [your_training_corpus] --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise 50 for more options, run `train_rdlm.py --help`. Parameters you may want to adjust include the vocabulary size of the label model (depending on the number of diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index eca1b3a49..f3ce41080 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -113,13 +113,14 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p int_list.extend(parent_heads) int_list.extend(parent_labels) + # write root of tree if options.mode == 'label': int_list.append(output_vocab.get(label, 0)) - sys.stdout.write(' '.join(map(str, int_list)) + '\n') + options.output.write(' '.join(map(str, int_list)) + '\n') elif options.mode == 'head' and not head == '': int_list.append(vocab.get(label, 0)) int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0))) - sys.stdout.write(' '.join(map(str, int_list)) + '\n') + options.output.write(' '.join(map(str, int_list)) + '\n') parent_heads.append(vocab.get(head, 0)) parent_labels.append(vocab.get(label, 0)) diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py index 55ecbe554..6d017602e 100755 --- a/scripts/training/rdlm/extract_vocab.py +++ b/scripts/training/rdlm/extract_vocab.py @@ -59,10 +59,6 @@ def get_head(xml, args): preterminal = child.get('label') head = escape_text(child.text.strip()) - # hack for split compounds - elif child[-1].get('label') == 'SEGMENT': - return 
escape_text(child[-1].text.strip()), 'SEGMENT' - elif args.ptkvz and head and child.get('label') == 'avz': for grandchild in child: if grandchild.get('label') == 'PTKVZ': diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index 1e7ecac52..15e56c430 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -43,7 +43,7 @@ parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)") parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)") parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)") -parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)") +parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)") parser.set_defaults( @@ -95,7 +95,7 @@ def prepare_vocabulary(options): filtered_vocab = open(orig).readlines() orig = vocab_prefix + '.nonterminals' filtered_vocab += open(orig).readlines() - filtered_vocab = [word for word in filtered_vocab if not word.startswith(prefix) for prefix in blacklist] + filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)] if options.output_vocab_size: filtered_vocab = filtered_vocab[:options.output_vocab_size] else: @@ -127,12 +127,13 @@ def main(options): sys.stderr.write('extracting syntactic n-grams\n') extract_syntactic_ngrams.main(extract_options) - if validation_corpus: - extract_options.input = options.validation_corpus - options.validation_file = os.path.join(options.working_dir, 
os.path.basename(options.validation_corpus) + '.numberized') - extract_options.output = options.validation_file + if options.validation_corpus: + extract_options.input = open(options.validation_corpus) + options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus)) + extract_options.output = open(options.validation_file + '.numberized', 'w') sys.stderr.write('extracting syntactic n-grams (validation file)\n') extract_syntactic_ngrams.main(extract_options) + extract_options.output.close() sys.stderr.write('training neural network\n') train_nplm.main(options) @@ -141,8 +142,8 @@ def main(options): ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'), options.nplm_home, os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)), - os.path.join(options.working_dir, options.corpus_stem + '.numberized'), - os.path.join(options.output_dir, options.output_model + '.model.nplm.') + os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'), + os.path.join(options.output_dir, options.output_model + '.model.nplm') ]) if ret: raise Exception("averaging null words failed") -- cgit v1.2.3 From a47fc006359b68eea2fcc369fae983338226a925 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 27 Apr 2015 17:35:19 +0400 Subject: option to output factors --- scripts/training/wrappers/madamira-wrapper.perl | 57 +++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 4 deletions(-) (limited to 'scripts') diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl index 6e7efe245..9866e6421 100755 --- a/scripts/training/wrappers/madamira-wrapper.perl +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -8,21 +8,32 @@ use File::Basename; use FindBin qw($RealBin); use Cwd 'abs_path'; +sub GetFactors; + + my $TMPDIR = "tmp"; my $SCHEME = "D2"; my $KEEP_TMP = 0; my $MADA_DIR; +my $FACTORS_STR; +my 
@FACTORS; + GetOptions( "scheme=s" => \$SCHEME, "tmpdir=s" => \$TMPDIR, "keep-tmp" => \$KEEP_TMP, - "mada-dir=s" => \$MADA_DIR + "mada-dir=s" => \$MADA_DIR, + "factors=s" => \$FACTORS_STR ) or die("ERROR: unknown options"); $TMPDIR = abs_path($TMPDIR); print STDERR "TMPDIR=$TMPDIR \n"; +if (defined($FACTORS_STR)) { + @FACTORS = split(",", $FACTORS_STR); +} + #binmode(STDIN, ":utf8"); #binmode(STDOUT, ":utf8"); @@ -75,13 +86,21 @@ while(my $line = <MADA_OUT>) { print "\n"; } elsif (index($line, ";;WORD") == 0) { - # word + # word my $word = substr($line, 7, length($line) - 8); - #print STDERR "FOund $word\n"; + #print STDERR "FOund $word\n"; + + for (my $i = 0; $i < 4; ++$i) { + $line = <MADA_OUT>; + } + + my $factors = GetFactors($line, \@FACTORS); + $word .= $factors; + print "$word "; } else { - #print STDERR "NADA\n"; + #print STDERR "NADA\n"; } } close (MADA_OUT); @@ -91,3 +110,33 @@ if ($KEEP_TMP == 0) { # `rm -rf $TMPDIR`; } + +########################### +sub GetFactors +{ + my $line = shift; + my $factorsRef = shift; + my @factors = @{$factorsRef}; + + # all factors + my %allFactors; + my @toks = split(" ", $line); + for (my $i = 1; $i < scalar(@toks); ++$i) { + #print " tok=" .$toks[$i]; + + my ($key, $value) = split(":", $toks[$i]); + $allFactors{$key} = $value; + } + + my $ret = ""; + my $factorType; + foreach $factorType(@factors) { + #print "factorType=$factorType "; + my $value = $allFactors{$factorType}; + + $ret .= "|$value"; + } + + return $ret; +} + -- cgit v1.2.3 From 8adad4fc2e1ae609ffbd8fe76261540cac19a125 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 27 Apr 2015 17:39:49 +0400 Subject: exec permission --- scripts/training/filter-model-given-input.pl | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/training/filter-model-given-input.pl (limited to 'scripts') diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl old mode 100644 new mode 100755 -- cgit v1.2.3 From
b7792b227a337c36d97d3c0979d11e6955ba368c Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 28 Apr 2015 12:29:58 +0400 Subject: script to convert arabic to bw, and vice versa --- scripts/other/buckwalter.perl | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100755 scripts/other/buckwalter.perl (limited to 'scripts') diff --git a/scripts/other/buckwalter.perl b/scripts/other/buckwalter.perl new file mode 100755 index 000000000..62544e212 --- /dev/null +++ b/scripts/other/buckwalter.perl @@ -0,0 +1,33 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Encode::Arabic::Buckwalter; +use Getopt::Long "GetOptions"; + +my $direction; +GetOptions('direction=i' => \$direction) + or exit(1); +# direction: 1=arabic->bw, 2=bw->arabic + +die("ERROR: need to set direction") unless defined($direction); + + + +while (my $line = <STDIN>) { + chomp($line); + + my $lineOut; + if ($direction == 1) { + $lineOut = encode 'buckwalter', decode 'utf8', $line; + } + elsif ($direction == 2) { + $lineOut = encode 'utf8', decode 'buckwalter', $line; + } + else { + die("Unknown direction: $direction"); + } + print "$lineOut\n"; + +} + -- cgit v1.2.3 From 8f9bf7ea386feb1aef5413730bd627a1161c5928 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 28 Apr 2015 15:03:59 +0400 Subject: add -config --- scripts/training/wrappers/madamira-wrapper.perl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'scripts') diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl index 9866e6421..5c1d0404f 100755 --- a/scripts/training/wrappers/madamira-wrapper.perl +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -15,6 +15,7 @@ my $TMPDIR = "tmp"; my $SCHEME = "D2"; my $KEEP_TMP = 0; my $MADA_DIR; +my $CONFIG; my $FACTORS_STR; my @FACTORS; @@ -24,9 +25,14 @@ GetOptions( "tmpdir=s" => \$TMPDIR, "keep-tmp" => \$KEEP_TMP, "mada-dir=s" => \$MADA_DIR, - "factors=s" => \$FACTORS_STR + 
"factors=s" => \$FACTORS_STR, + "config=s" => \$CONFIG ) or die("ERROR: unknown options"); +if (!defined($CONFIG)) { + $CONFIG = "$MADA_DIR/samples/sampleConfigFile.xml"; +} + $TMPDIR = abs_path($TMPDIR); print STDERR "TMPDIR=$TMPDIR \n"; @@ -65,7 +71,7 @@ else { $cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x"; `$cmd`; -$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*"; +$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*"; print STDERR "Executing: $cmd\n"; `$cmd`; @@ -77,7 +83,7 @@ print STDERR "Executing: $cmd\n"; open(MADA_OUT,"<$infile.mada"); #binmode(MADA_OUT, ":utf8"); while(my $line = <MADA_OUT>) { - chop($line); + chomp($line); #print STDERR "line=$line \n"; if (index($line, "SENTENCE BREAK") == 0) { -- cgit v1.2.3