Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author    phikoehn <pkoehn@inf.ed.ac.uk>    2012-05-28 23:15:58 +0400
committer phikoehn <pkoehn@inf.ed.ac.uk>    2012-05-28 23:15:58 +0400
commit    6d1165654caf8edc995a41a4c6c9666e65ebce96 (patch)
tree      b4562c29ba79c5398e20b051ae6eb7f950a305e4
parent    db1e6040b241c74ed01b9da0e4a8bd2f4c15f176 (diff)
script updates and added ems config help
-rw-r--r--  scripts/ems/example/config.basic          3
-rw-r--r--  scripts/ems/example/config.factored       3
-rw-r--r--  scripts/ems/example/config.hierarchical   3
-rw-r--r--  scripts/ems/example/config.syntax         3
-rw-r--r--  scripts/ems/example/config.toy            3
-rwxr-xr-x  scripts/generic/compound-splitter.perl  174
6 files changed, 161 insertions(+), 28 deletions(-)
diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic
index c08f51764..939e13aad 100644
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@@ -260,7 +260,8 @@ script = $moses-script-dir/training/train-model.perl
### general options
# these are options that are passed on to train-model.perl, for instance
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
-# * "-sort-buffer-size 8G" to reduce on-disk sorting
+# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
+# * "-sort-parallel 8 -cores 8" to speed up phrase table building
#
#training-options = ""
diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored
index 4bc198a6b..df9f28f33 100644
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@@ -280,7 +280,8 @@ script = $moses-script-dir/training/train-model.perl
### general options
# these are options that are passed on to train-model.perl, for instance
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
-# * "-sort-buffer-size 8G" to reduce on-disk sorting
+# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
+# * "-sort-parallel 8 -cores 8" to speed up phrase table building
#
#training-options = ""
diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical
index b9858f393..6161f6ac4 100644
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@@ -260,7 +260,8 @@ script = $moses-script-dir/training/train-model.perl
### general options
# these are options that are passed on to train-model.perl, for instance
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
-# * "-sort-buffer-size 8G" to reduce on-disk sorting
+# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
+# * "-sort-parallel 8 -cores 8" to speed up phrase table building
#
#training-options = ""
diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax
index 7c97b9ac4..635585844 100644
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@@ -264,7 +264,8 @@ script = $moses-script-dir/training/train-model.perl
### general options
# these are options that are passed on to train-model.perl, for instance
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
-# * "-sort-buffer-size 8G" to reduce on-disk sorting
+# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
+# * "-sort-parallel 8 -cores 8" to speed up phrase table building
#
#training-options = ""
diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy
index 140a45229..7b8c95faa 100644
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@@ -244,7 +244,8 @@ script = $moses-script-dir/training/train-model.perl
### general options
# these are options that are passed on to train-model.perl, for instance
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
-# * "-sort-buffer-size 8G" to reduce on-disk sorting
+# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
+# * "-sort-parallel 8 -cores 8" to speed up phrase table building
#
#training-options = ""
diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl
index ced661e3f..9948c648e 100755
--- a/scripts/generic/compound-splitter.perl
+++ b/scripts/generic/compound-splitter.perl
@@ -8,15 +8,23 @@ my $FILLER = ":s:es";
my $MIN_SIZE = 3;
my $MIN_COUNT = 5;
my $MAX_COUNT = 5;
+my $FACTORED = 0;
+my $SYNTAX = 0;
+my $MARK_SPLIT = 0;
+my $BINARIZE = 0;
$HELP = 1
unless &GetOptions('corpus=s' => \$CORPUS,
'model=s' => \$MODEL,
'filler=s' => \$FILLER,
+ 'factored' => \$FACTORED,
'min-size=i' => \$MIN_SIZE,
'min-count=i' => \$MIN_COUNT,
'max-count=i' => \$MAX_COUNT,
'help' => \$HELP,
'verbose' => \$VERBOSE,
+ 'syntax' => \$SYNTAX,
+ 'binarize' => \$BINARIZE,
+ 'mark-split' => \$MARK_SPLIT,
'train' => \$TRAIN);
if ($HELP ||
@@ -29,59 +37,152 @@ if ($HELP ||
print "options: -min-size: minimum word size (default $MIN_SIZE)\n";
print " -min-count: minimum word count (default $MIN_COUNT)\n";
print " -filler: filler letters between words (default $FILLER)\n";
+ print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n";
+ print " -syntax: syntactically parsed data (default $SYNTAX)\n";
+ print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n";
+ print " -binarize: binarize subtree for split word (default $BINARIZE)\n";
exit;
}
if ($TRAIN) {
- &train;
+ if ($SYNTAX) { &train_syntax(); }
+ elsif ($FACTORED) { &train_factored(); }
+ else { &train(); }
}
else {
- &apply;
+ &apply();
}
sub train {
- my %WORD;
+ my %COUNT;
open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
while(<CORPUS>) {
chop; s/\s+/ /g; s/^ //; s/ $//;
foreach (split) {
- $WORD{$_}++;
+ $COUNT{$_}++;
}
}
- close($CORPUS);
+ close(CORPUS);
+ &save_trained_model(\%COUNT);
+}
+
+sub save_trained_model {
+ my ($COUNT) = @_;
my $id = 0;
open(MODEL,">".$MODEL);
- foreach my $word (keys %WORD) {
- print MODEL "".(++$id)."\t".$word."\t".$WORD{$word}."\n";
+ foreach my $word (keys %$COUNT) {
+ print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n";
}
close(MODEL);
- print STDERR "written model file with ".(scalar keys %WORD)." words.\n";
+ print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n";
+}
+
+sub train_factored {
+ my (%COUNT,%FACTORED_COUNT);
+ # collect counts for interpretations for each surface word
+ open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
+ while(<CORPUS>) {
+ chop; s/\s+/ /g; s/^ //; s/ $//;
+ foreach my $factored_word (split) {
+ my $word = $factored_word;
+ $word =~ s/\|.+//g; # just first factor
+ $FACTORED_COUNT{$word}{$factored_word}++;
+ }
+ }
+ close(CORPUS);
+ # only preserve most frequent interpretation, assign sum of counts
+ foreach my $word (keys %FACTORED_COUNT) {
+ my ($max,$best,$total) = (0,"",0);
+ foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) {
+ my $count = $FACTORED_COUNT{$word}{$factored_word};
+ $total += $count;
+ if ($count > $max) {
+ $max = $count;
+ $best = $factored_word;
+ }
+ }
+ $COUNT{$best} = $total;
+ }
+ &save_trained_model(\%COUNT);
+}
+
+sub train_syntax {
+ my (%COUNT,%LABELED_COUNT);
+ # collect counts for interpretations for each surface word
+ open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
+ while(<CORPUS>) {
+ chop; s/\s+/ /g; s/^ //; s/ $//;
+ my $label;
+ foreach (split) {
+ if (/^label="([^\"]+)"/) {
+ $label = $1;
+ }
+ elsif (! /^</) {
+ $LABELED_COUNT{$_}{$label}++;
+ }
+ }
+ }
+ close(CORPUS);
+
+ # only preserve most frequent label, assign sum of counts
+ foreach my $word (keys %LABELED_COUNT) {
+ my ($max,$best,$total) = (0,"",0);
+ foreach my $label (keys %{$LABELED_COUNT{$word}}) {
+ my $count = $LABELED_COUNT{$word}{$label};
+ $total += $count;
+ if ($count > $max) {
+ $max = $count;
+ $best = "$word $label";
+ }
+ }
+ $COUNT{$best} = $total;
+ }
+ &save_trained_model(\%COUNT);
}
sub apply {
- my (%WORD,%TRUECASE);
+ my (%COUNT,%TRUECASE,%LABEL);
open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'");
while(<MODEL>) {
chomp;
- my ($id,$word,$count) = split(/\t/);
+ my ($id,$factored_word,$count) = split(/\t/);
+ my $label;
+ ($factored_word,$label) = split(/ /,$factored_word);
+ my $word = $factored_word;
+ $word =~ s/\|.+//g; # just first factor
my $lc = lc($word);
# if word exists with multiple casings, only record most frequent
- next if defined($WORD{$lc}) && $WORD{$lc} > $count;
- $WORD{$lc} = $count;
- $TRUECASE{$lc} = $word;
+ next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
+ $COUNT{$lc} = $count;
+ $TRUECASE{$lc} = $factored_word;
+ $LABEL{$lc} = $label if $SYNTAX;
}
close(MODEL);
while(<STDIN>) {
my $first = 1;
chop; s/\s+/ /g; s/^ //; s/ $//;
- foreach my $word (split) {
+ my @BUFFER; # for xml tags
+ foreach my $factored_word (split) {
print " " unless $first;
$first = 0;
+ # syntax: don't split xml
+ if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
+ push @BUFFER,$factored_word;
+ $first = 1;
+ next;
+ }
+
+ # get case class
+ my $word = $factored_word;
+ $word =~ s/\|.+//g; # just first factor
+ my $lc = lc($word);
+
# don't split frequent words
- if (defined($WORD{$word}) && $WORD{$word}>=$MAX_COUNT) {
- print $word;
+ if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
+ print $factored_word;
next;
}
@@ -100,17 +201,18 @@ sub apply {
my $subword = lc(substr($word,
$start+length($filler),
$end-$start+1-length($filler)));
- next unless defined($WORD{$subword});
- next unless $WORD{$subword} >= $MIN_COUNT;
- print STDERR "\tmatching word $start .. $end ($filler)$subword $WORD{$subword}\n" if $VERBOSE;
- push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $WORD{$subword}";
+ next unless defined($COUNT{$subword});
+ next unless $COUNT{$subword} >= $MIN_COUNT;
+ print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE;
+ push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}";
}
}
}
# no matches at all?
if (!defined($REACHABLE{$final})) {
- print $word;
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
+ print $factored_word;
next;
}
@@ -152,9 +254,35 @@ sub apply {
last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final};
for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; }
}
- $best_split = $word unless $best_split =~ / /; # do not change case for unsplit words
- print $best_split;
+ if ($best_split !~ / /) {
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
+ print $word; # do not change case for unsplit words
+ next;
+ }
+ if (!$SYNTAX) {
+ print $best_split;
+ }
+ else {
+ $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT;
+ $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n");
+ my $pos = $1;
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
+
+ my @SPLIT = split(/ /,$best_split);
+ my @OUT = ();
+ if ($BINARIZE) {
+ for(my $w=0;$w<scalar(@SPLIT)-2;$w++) {
+ push @OUT,"<tree label=\"\@$pos\">";
+ }
+ }
+ for(my $w=0;$w<scalar(@SPLIT);$w++) {
+ if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; }
+ push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>";
+ }
+ print join(" ",@OUT);
+ }
}
+ print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer
print "\n";
}
}