Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: phikoehn <pkoehn@inf.ed.ac.uk>, 2013-06-09 17:28:56 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk>, 2013-06-09 17:28:56 +0400
commit: ce372477c96e1ce4d36a1183a1501973fe368b48 (patch)
tree: 397c4ccba04e4828bf7997cc8e867dc01f30775b /scripts/training
parent: 2e8fbe77a2dfebf9abbc7f7c5c6adaa99f58de6b (diff)
conversion script from Moses V1.0 moses.ini files to current format - may need some further tweaking
Diffstat (limited to 'scripts/training')
-rwxr-xr-xscripts/training/convert-moses-ini-to-v2.perl169
1 files changed, 169 insertions, 0 deletions
diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl
new file mode 100755
index 000000000..50e2ee78f
--- /dev/null
+++ b/scripts/training/convert-moses-ini-to-v2.perl
@@ -0,0 +1,169 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+# Text seen since the last section header (comments, blank lines). It is
+# printed in front of a section that is copied verbatim and thrown away
+# in front of a section that is converted or dropped.
+my $header = "";
+
+# The whole old-format moses.ini is read from STDIN up front because
+# get_data() walks the lines through the shared index $i.
+my @INI = <STDIN>;
+
+# Numeric implementation codes found in the old [ttable-file] and
+# [lmodel-file] lines, mapped to the feature names written to [feature].
+my %TTABLE_IMPLEMENTATION = ( 0 => "PhraseDictionaryMemory",
+                              1 => "PhraseDictionaryBinary" );
+my %LM_IMPLEMENTATION = ( 0 => "SRILM",
+                          8 => "KENLM lazyken=0" );
+
+
+# %FEATURE: section name => array ref of that section's data lines
+# %WEIGHT:  name following "weight-" => array ref of that section's lines
+my (%FEATURE,%WEIGHT);
+my $i=0;
+
+# First pass over the input: sort every [section] into one of four groups.
+#   - model/feature sections -> stored in %FEATURE for conversion later
+#   - [weight-*] sections    -> stored in %WEIGHT ([weight-file] excepted)
+#   - [report-sparse-features] -> dropped
+#   - everything else        -> copied to STDOUT unchanged
+for(; $i<scalar(@INI); $i++) {
+  my $line = $INI[$i];
+  if ($line =~ /^\[(.+)\]/) {
+    my $section = $1;
+    if ($section eq "ttable-file" ||
+        $section eq "distortion-file" ||
+        $section eq "generation-file" ||
+        $section eq "lmodel-file" ||
+        $section eq "ttable-limit" ||
+        $section eq "target-word-insertion-feature" ||
+        $section eq "source-word-deletion-feature" ||
+        $section eq "word-translation-feature" ||
+        $section eq "phrase-length-feature") {
+      $FEATURE{$section} = get_data();
+    }
+    elsif ($section =~ /weight-(.+)/ && $section ne "weight-file") {
+      # Keep the capture in a named variable before calling any other code.
+      my $name = $1;
+      $WEIGHT{$name} = get_data();
+    }
+    elsif ($section eq "report-sparse-features") {
+      # The section is dropped, so its data lines must be consumed as well.
+      # Otherwise they are collected into $header and leak into the output
+      # in front of the next verbatim section or at the end of the file.
+      get_data();
+    }
+    else {
+      print STDERR "include section [$section] verbatim.\n";
+      print $header.$line;
+      my $SECTION = get_data();
+      foreach (@{$SECTION}) {
+        print $_."\n";
+      }
+    }
+    $header = "";
+  }
+  else {
+    $header .= $line;
+  }
+}
+# Whatever trailed the last section (blank lines, comments).
+print $header;
+
+# Second pass: build the text of the new [feature] and [weight] sections
+# from what the parsing loop stored in %FEATURE and %WEIGHT.
+my ($feature,$weight) = ("","");
+
+# These three feature functions are always written. The word penalty comes
+# from [weight-w]; the first [weight-d] value is used for Distortion.
+$feature .= "UnknownWordPenalty\n";
+$weight .= "UnknownWordPenalty0= 1\n";
+
+$feature .= "WordPenalty\n";
+$weight .= "WordPenalty0= ".$WEIGHT{"w"}[0]."\n";
+
+$feature .= "Distortion\n";
+$weight .= "Distortion0= ".$WEIGHT{"d"}[0]."\n";
+
+# Sorted so that the same input always produces the same output order
+# (plain "keys" order differs from run to run).
+# Fields are split with split(' ', ...) so that tabs or repeated blanks
+# between fields do not create empty fields.
+foreach my $section (sort keys %FEATURE) {
+  if ($section eq "phrase-length-feature") {
+    $feature .= "PhraseLengthFeature name=pl\n";
+  }
+  elsif ($section eq "target-word-insertion-feature") {
+    my ($factor,$file) = split(' ',$FEATURE{$section}[0]);
+    $feature .= "TargetWordInsertionFeature name=twi factor=$factor";
+    $feature .= " path=$file" if defined($file);
+    $feature .= "\n";
+  }
+  # The parser only ever stores "source-word-deletion-feature"; testing for
+  # "source-word-insertion-feature" made this branch unreachable.
+  elsif ($section eq "source-word-deletion-feature") {
+    my ($factor,$file) = split(' ',$FEATURE{$section}[0]);
+    # factor= takes the factor, not the (optional) file name.
+    $feature .= "SourceWordDeletionFeature name=swd factor=$factor";
+    $feature .= " path=$file" if defined($file);
+    $feature .= "\n";
+  }
+  elsif ($section eq "word-translation-feature") {
+    # Fields 3-6 of the old line are not carried over.
+    my ($factors,$simple,$dummy1,$dummy2,$dummy3,$dummy4,$file_f,$file_e) = split(' ',$FEATURE{$section}[0]);
+    my ($input_factor,$output_factor) = split(/\-/, $factors);
+    $feature .= "WordTranslationFeature name=wt input-factor=$input_factor output-factor=$output_factor simple=$simple source-context=0 target-context=0";
+    $feature .= " source-path=$file_f target-path=$file_e" if defined($file_f);
+    $feature .= "\n";
+  }
+  elsif ($section eq "ttable-file") {
+    my $index = 0;
+    # "|| []" keeps a missing [ttable-limit] / [weight-t] section from
+    # aborting the script with an undefined-reference error under strict.
+    my @TTABLE_LIMIT = @{ $FEATURE{"ttable-limit"} || [] };
+    my @W = @{ $WEIGHT{"t"} || [] };
+    foreach my $line (@{$FEATURE{$section}}) {
+      my ($imp, $input_factor, $output_factor, $weight_count, $file) = split(' ',$line);
+      my $implementation = $TTABLE_IMPLEMENTATION{$imp};
+      if (!defined($implementation)) {
+        # Report the code that was read; $implementation is undefined here.
+        print STDERR "ERROR: Unknown translation table implementation: $imp\n";
+        $implementation = "UNKNOWN";
+      }
+      $feature .= "$implementation name=TranslationModel$index num-features=$weight_count path=$file input-factor=$input_factor output-factor=$output_factor";
+      $feature .= " ttable-limit=".$TTABLE_LIMIT[$index] if $#TTABLE_LIMIT >= $index;
+      $feature .= "\n";
+      # Each table takes the next $weight_count values of [weight-t].
+      $weight .= "TranslationModel$index=".get_weights(\@W,$weight_count)."\n";
+      $index++;
+    }
+  }
+  elsif ($section eq "generation-file") {
+    my $index = 0;
+    my @W = @{ $WEIGHT{"generation"} || [] };
+    foreach my $line (@{$FEATURE{$section}}) {
+      my ($input_factor,$output_factor,$weight_count,$file) = split(' ',$line);
+      $feature .= "Generation name=GenerationModel$index num-features=$weight_count path=$file input-factor=$input_factor output-factor=$output_factor\n";
+      $weight .= "GenerationModel$index=".get_weights(\@W,$weight_count)."\n";
+      $index++;
+    }
+  }
+  elsif ($section eq "distortion-file") {
+    my $index = 0;
+    my @W = @{ $WEIGHT{"d"} || [] };
+    # The first [weight-d] value was already written as Distortion0 above.
+    my $ignore = shift @W;
+    foreach my $line (@{$FEATURE{$section}}) {
+      my ($factors,$type,$weight_count,$file) = split(' ',$line);
+      my ($input_factor,$output_factor) = split(/\-/, $factors);
+      $feature .= "LexicalReordering name=LexicalReordering$index num-features=$weight_count type=$type input-factor=$input_factor output-factor=$output_factor path=$file\n";
+      $weight .= "LexicalReordering$index=".get_weights(\@W,$weight_count)."\n";
+      $index++;
+    }
+  }
+  elsif ($section eq "lmodel-file") {
+    my $index = 0;
+    my @W = @{ $WEIGHT{"l"} || [] };
+    foreach my $line (@{$FEATURE{$section}}) {
+      my ($imp,$factor,$order,$file) = split(' ',$line);
+      my $implementation = $LM_IMPLEMENTATION{$imp};
+      if (!defined($implementation)) {
+        # Report the code that was read; $implementation is undefined here.
+        print STDERR "ERROR: Unknown language model implementation: $imp\n";
+        $implementation = "UNKNOWN";
+      }
+      $feature .= "$implementation name=LM$index factor=$factor path=$file order=$order\n";
+      # One weight per language model.
+      $weight .= "LM$index=".get_weights(\@W,1)."\n";
+      $index++;
+    }
+  }
+}
+
+print "\n[feature]\n$feature\n";
+print "\n[weight]\n$weight\n";
+
+# Collect the data lines that follow the section header at $INI[$i].
+#
+# Reads from the line after the current position until a blank line, a
+# line starting with "[" (next section), a line starting with "#" or the
+# end of @INI. Uses and updates the file-level index $i: on return it
+# points at the last collected line (or is unchanged if there was none),
+# so the caller's loop increment lands on the terminating line.
+#
+# Any arguments are ignored; none were ever used.
+#
+# Returns a reference to an array holding the collected lines with their
+# line ending removed.
+sub get_data {
+  my @DATA;
+  while (++$i < scalar(@INI) &&
+         $INI[$i] !~ /^\s*$/ &&
+         $INI[$i] !~ /^\[/ &&
+         $INI[$i] !~ /^\#/) {
+    push @DATA,$INI[$i];
+  }
+  $i--;
+  # Remove only an actual line ending (LF or CRLF). chop() removed the last
+  # character unconditionally, which ate a real character when the final
+  # line of the input had no newline and left a "\r" on CRLF input.
+  s/\r?\n\z// foreach @DATA;
+  return \@DATA;
+}
+
+# Take the next $count weights off the front of a weight list.
+#
+#   $W     - reference to an array of weight strings; the values used are
+#            removed from it, so successive calls continue where the
+#            previous one stopped
+#   $count - number of weights to take
+#
+# Returns the weights as one string, each preceded by a single blank
+# (e.g. " 0.3 0.25"). If the list runs out, an error is reported on STDERR
+# and the missing weights are left empty.
+sub get_weights {
+  my ($W,$count) = @_;
+  my $list = "";
+  for(my $w=0;$w<$count;$w++) {
+    my $value = shift @{$W};
+    if (!defined($value)) {
+      print STDERR "ERROR: not enough weights: $count requested, only $w available\n";
+      $value = "";
+    }
+    # Strip a line ending only if one is present. The values handed in by
+    # get_data() no longer have one, and an unconditional chop() cut off
+    # the last digit of every weight ("0.3" became "0.").
+    $value =~ s/\r?\n\z//;
+    $list .= " $value";
+  }
+  return $list;
+}