From 6d1165654caf8edc995a41a4c6c9666e65ebce96 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Mon, 28 May 2012 20:15:58 +0100 Subject: script updates and added ems config help --- scripts/ems/example/config.basic | 3 +- scripts/ems/example/config.factored | 3 +- scripts/ems/example/config.hierarchical | 3 +- scripts/ems/example/config.syntax | 3 +- scripts/ems/example/config.toy | 3 +- scripts/generic/compound-splitter.perl | 174 +++++++++++++++++++++++++++----- 6 files changed, 161 insertions(+), 28 deletions(-) diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic index c08f51764..939e13aad 100644 --- a/scripts/ems/example/config.basic +++ b/scripts/ems/example/config.basic @@ -260,7 +260,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored index 4bc198a6b..df9f28f33 100644 --- a/scripts/ems/example/config.factored +++ b/scripts/ems/example/config.factored @@ -280,7 +280,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical index b9858f393..6161f6ac4 100644 --- a/scripts/ems/example/config.hierarchical +++ b/scripts/ems/example/config.hierarchical @@ -260,7 +260,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax index 7c97b9ac4..635585844 100644 --- a/scripts/ems/example/config.syntax +++ b/scripts/ems/example/config.syntax @@ -264,7 +264,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy index 140a45229..7b8c95faa 100644 --- a/scripts/ems/example/config.toy +++ b/scripts/ems/example/config.toy @@ -244,7 +244,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index ced661e3f..9948c648e 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -8,15 +8,23 @@ my $FILLER = ":s:es"; my $MIN_SIZE = 3; my $MIN_COUNT = 5; my $MAX_COUNT = 5; +my $FACTORED = 0; +my $SYNTAX = 0; +my $MARK_SPLIT = 0; +my $BINARIZE = 0; $HELP = 1 unless &GetOptions('corpus=s' => \$CORPUS, 'model=s' => \$MODEL, 'filler=s' => \$FILLER, + 'factored' => \$FACTORED, 'min-size=i' => \$MIN_SIZE, 'min-count=i' => \$MIN_COUNT, 'max-count=i' => \$MAX_COUNT, 'help' => \$HELP, 'verbose' => \$VERBOSE, + 'syntax' => \$SYNTAX, + 'binarize' => \$BINARIZE, + 'mark-split' => \$MARK_SPLIT, 'train' => \$TRAIN); if ($HELP || @@ -29,59 +37,152 @@ if ($HELP || print "options: -min-size: minimum word size (default $MIN_SIZE)\n"; print " -min-count: minimum word count (default $MIN_COUNT)\n"; print " -filler: filler letters between words (default $FILLER)\n"; + print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n"; + print " -syntax: syntactically parsed data (default $SYNTAX)\n"; + print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n"; + print " -binarize: binarize subtree for split word (default $BINARIZE)\n"; exit; } if ($TRAIN) { - &train; + if ($SYNTAX) { &train_syntax(); } + elsif ($FACTORED) { &train_factored(); } + else { &train(); } } else { - &apply; + &apply(); } sub train { - my %WORD; + my %COUNT; open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); while() { chop; s/\s+/ /g; s/^ //; s/ $//; foreach (split) { - $WORD{$_}++; + $COUNT{$_}++; } } - close($CORPUS); + close(CORPUS); + &save_trained_model(\%COUNT); +} + +sub save_trained_model { + my ($COUNT) = @_; my $id = 0; open(MODEL,">".$MODEL); - foreach my $word (keys %WORD) { - print MODEL "".(++$id)."\t".$word."\t".$WORD{$word}."\n"; + foreach my $word (keys %$COUNT) { + print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n"; } close(MODEL); - print STDERR "written model file with ".(scalar keys %WORD)." words.\n"; + print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n"; +} + +sub train_factored { + my (%COUNT,%FACTORED_COUNT); + # collect counts for interpretations for each surface word + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while() { + chop; s/\s+/ /g; s/^ //; s/ $//; + foreach my $factored_word (split) { + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + $FACTORED_COUNT{$word}{$factored_word}++; + } + } + close(CORPUS); + # only preserve most frequent interpretation, assign sum of counts + foreach my $word (keys %FACTORED_COUNT) { + my ($max,$best,$total) = (0,"",0); + foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) { + my $count = $FACTORED_COUNT{$word}{$factored_word}; + $total += $count; + if ($count > $max) { + $max = $count; + $best = $factored_word; + } + } + $COUNT{$best} = $total; + } + &save_trained_model(\%COUNT); +} + +sub train_syntax { + my (%COUNT,%LABELED_COUNT); + # collect counts for interpretations for each surface word + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while() { + chop; s/\s+/ /g; s/^ //; s/ $//; + my $label; + foreach (split) { + if (/^label="([^\"]+)"/) { + $label = $1; + } + elsif (! /^ $max) { + $max = $count; + $best = "$word $label"; + } + } + $COUNT{$best} = $total; + } + &save_trained_model(\%COUNT); } sub apply { - my (%WORD,%TRUECASE); + my (%COUNT,%TRUECASE,%LABEL); open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'"); while() { chomp; - my ($id,$word,$count) = split(/\t/); + my ($id,$factored_word,$count) = split(/\t/); + my $label; + ($factored_word,$label) = split(/ /,$factored_word); + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor my $lc = lc($word); # if word exists with multipe casings, only record most frequent - next if defined($WORD{$lc}) && $WORD{$lc} > $count; - $WORD{$lc} = $count; - $TRUECASE{$lc} = $word; + next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; + $COUNT{$lc} = $count; + $TRUECASE{$lc} = $factored_word; + $LABEL{$lc} = $label if $SYNTAX; } close(MODEL); while() { my $first = 1; chop; s/\s+/ /g; s/^ //; s/ $//; - foreach my $word (split) { + my @BUFFER; # for xml tags + foreach my $factored_word (split) { print " " unless $first; $first = 0; + # syntax: don't split xml + if ($SYNTAX && ($factored_word =~ /^$/)) { + push @BUFFER,$factored_word; + $first = 1; + next; + } + + # get case class + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + # don't split frequent words - if (defined($WORD{$word}) && $WORD{$word}>=$MAX_COUNT) { - print $word; + if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) { + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; next; } @@ -100,17 +201,18 @@ sub apply { my $subword = lc(substr($word, $start+length($filler), $end-$start+1-length($filler))); - next unless defined($WORD{$subword}); - next unless $WORD{$subword} >= $MIN_COUNT; - print STDERR "\tmatching word $start .. $end ($filler)$subword $WORD{$subword}\n" if $VERBOSE; - push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $WORD{$subword}"; + next unless defined($COUNT{$subword}); + next unless $COUNT{$subword} >= $MIN_COUNT; + print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE; + push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}"; } } } # no matches at all? if (!defined($REACHABLE{$final})) { - print $word; + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; next; } @@ -152,9 +254,35 @@ sub apply { last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final}; for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; } } - $best_split = $word unless $best_split =~ / /; # do not change case for unsplit words - print $best_split; + if ($best_split !~ / /) { + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $word; # do not change case for unsplit words + next; + } + if (!$SYNTAX) { + print $best_split; + } + else { + $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT; + $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n"); + my $pos = $1; + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + + my @SPLIT = split(/ /,$best_split); + my @OUT = (); + if ($BINARIZE) { + for(my $w=0;$w"; + } + } + for(my $w=0;$w=2) { push @OUT, ""; } + push @OUT," $SPLIT[$w] "; + } + print join(" ",@OUT); + } } + print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer print "\n"; } } -- cgit v1.2.3