Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <hieuhoang@gmail.com>2012-10-19 21:38:46 +0400
committerHieu Hoang <hieuhoang@gmail.com>2012-10-19 21:38:46 +0400
commitea48ab7845773a2da2bca48a3dd067ddcffa8f4b (patch)
tree8d1be66b3f097cee6c0f0326692f4c98b8999873 /scripts/fuzzy-match/create_xml.perl
parent7a19ac861c4f11a9171bfcf66475797f57f27b26 (diff)
perltidy
Diffstat (limited to 'scripts/fuzzy-match/create_xml.perl')
-rwxr-xr-xscripts/fuzzy-match/create_xml.perl519
1 files changed, 267 insertions, 252 deletions
diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl
index de3d61f71..36715cc35 100755
--- a/scripts/fuzzy-match/create_xml.perl
+++ b/scripts/fuzzy-match/create_xml.perl
@@ -1,7 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl -w -d
-binmode(STDIN, ":utf8");
-binmode(STDOUT, ":utf8");
+binmode( STDIN, ":utf8" );
+binmode( STDOUT, ":utf8" );
use strict;
use FindBin qw($RealBin);
@@ -9,49 +9,58 @@ use File::Basename;
sub trim($);
+print STDERR "HELLO ";
+for ( my $i = 0 ; $i < scalar @ARGV ; ++$i ) {
+ print STDERR $ARGV[$i] . " ";
+}
+print STDERR "GOODBYE \n";
+
############################################
-# START
+# START
my $inPath = $ARGV[0];
-open(IN,"<".$inPath);
+open( IN, "<" . $inPath );
-open(RULE,">$inPath.extract");
-open(RULE_INV,">$inPath.extract.inv");
+open( RULE, ">$inPath.extract" );
+open( RULE_INV, ">$inPath.extract.inv" );
-my ($sentenceInd, $score, $source, $input, $target, $align, $path, $count);
+my ( $sentenceInd, $score, $source, $input, $target, $align, $path, $count );
# MAIN LOOP
-while ($sentenceInd = <IN>) {
- $score = <IN>;
- $source = <IN>;
- $input = <IN>;
- $target = <IN>;
- $align = <IN>;
- $path = <IN>;
- $count = <IN>;
- chomp($sentenceInd);
- chomp($score);
- chomp($source);
- chomp($input);
- chomp($target);
- chomp($align);
- chomp($path);
- chomp($count);
- $source = trim($sentenceInd);
- $source = trim($score);
- $source = trim($source);
- $input = trim($input);
- $target = trim($target);
- $align = trim($align);
- $path = trim($path);
- $count = trim($count);
-
- my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) = &create_xml($source, $input, $target, $align, $path);
-
- #print STDOUT $frame."\n";
- print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $count\n";
- print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $count\n";
- #print STDOUT "$sentenceInd ||| $score ||| $count\n";
+while ( $sentenceInd = <IN> ) {
+ $score = <IN>;
+ $source = <IN>;
+ $input = <IN>;
+ $target = <IN>;
+ $align = <IN>;
+ $path = <IN>;
+ $count = <IN>;
+ chomp($sentenceInd);
+ chomp($score);
+ chomp($source);
+ chomp($input);
+ chomp($target);
+ chomp($align);
+ chomp($path);
+ chomp($count);
+ $source = trim($sentenceInd);
+ $source = trim($score);
+ $source = trim($source);
+ $input = trim($input);
+ $target = trim($target);
+ $align = trim($align);
+ $path = trim($path);
+ $count = trim($count);
+
+ my ( $frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv ) =
+ &create_xml( $source, $input, $target, $align, $path );
+
+ #print STDOUT $frame."\n";
+ print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $count\n";
+ print RULE_INV
+ "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $count\n";
+
+ #print STDOUT "$sentenceInd ||| $score ||| $count\n";
}
@@ -65,245 +74,251 @@ close(RULE_INV);
my $lex_file = "-";
my $cmd;
-$cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $inPath.extract -lexical-file $lex_file -score-options \"--NoLex\" -phrase-translation-table $inPath.pt";
+$cmd =
+"$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $inPath.extract -lexical-file $lex_file -score-options \"--NoLex\" -phrase-translation-table $inPath.pt";
print STDERR "Executing: $cmd \n";
`$cmd`;
#######################################################
sub create_xml {
- my ($source,$input,$target,$alignment,$path) = @_;
-
- my @INPUT = split(/ /,$input);
- my @SOURCE = split(/ /,$source);
- my @TARGET = split(/ /,$target);
- my %ALIGN = &create_alignment($alignment);
-
- my %FRAME_INPUT;
- my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
- foreach (@TARGET) { push @TARGET_BITMAP,1 }
-
- ### STEP 1: FIND MISMATCHES
-
- my ($s,$i) = (0,0);
- my $currently_matching = 0;
- my ($start_s,$start_i) = (0,0);
-
- $path .= "X"; # indicate end
- print STDERR "$input\n$source\n$target\n$path\n";
- for(my $p=0;$p<length($path);$p++) {
- my $action = substr($path,$p,1);
-
- # beginning of a mismatch
- if ($currently_matching && $action ne "M" && $action ne "X") {
- $start_i = $i;
- $start_s = $s;
- $currently_matching = 0;
- }
-
- # end of a mismatch
- elsif (!$currently_matching &&
- ($action eq "M" || $action eq "X")) {
-
- # remove use of affected target words
- for(my $ss = $start_s; $ss<$s; $ss++) {
- foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
- $TARGET_BITMAP[$tt] = 0;
- }
-
- # also remove enclosed unaligned words?
- }
-
- # are there input words that need to be inserted ?
- print STDERR "($start_i<$i)?\n";
- if ($start_i<$i) {
-
- # take note of input words to be inserted
- my $insertion = "";
- for(my $ii = $start_i; $ii<$i; $ii++) {
- $insertion .= $INPUT[$ii]." ";
- }
-
- # find position for inserted input words
-
- # find first removed target word
- my $start_t = 1000;
- for(my $ss = $start_s; $ss<$s; $ss++) {
- foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
- $start_t = $tt if $tt < $start_t;
- }
- }
-
- # end of sentence? add to end
- if ($start_t == 1000 && $i > $#INPUT) {
- $start_t = $#TARGET;
- }
-
- # backtrack to previous words if unaligned
- if ($start_t == 1000) {
- $start_t = -1;
- for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
- foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
- $start_t = $tt if $tt > $start_t;
- }
- }
- }
- $FRAME_INPUT{$start_t} .= $insertion;
- my %NT = ("start_t" => $start_t,
- "start_i" => $start_i );
- push @NT,\%NT;
- }
- $currently_matching = 1;
- }
-
- print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
- if ($action ne "I") {
- print STDERR " ->";
- foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
- print STDERR " ".$tt;
- }
- }
- print STDERR "\n";
- $s++ unless $action eq "I";
- $i++ unless $action eq "D";
- $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
- push @INPUT_BITMAP, 1 if $action eq "M";
- push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
+ my ( $source, $input, $target, $alignment, $path ) = @_;
+
+ my @INPUT = split( / /, $input );
+ my @SOURCE = split( / /, $source );
+ my @TARGET = split( / /, $target );
+ my %ALIGN = &create_alignment($alignment);
+
+ my %FRAME_INPUT;
+ my ( @NT, @INPUT_BITMAP, @TARGET_BITMAP, %ALIGNMENT_I_TO_S );
+ foreach (@TARGET) { push @TARGET_BITMAP, 1 }
+
+ ### STEP 1: FIND MISMATCHES
+
+ my ( $s, $i ) = ( 0, 0 );
+ my $currently_matching = 0;
+ my ( $start_s, $start_i ) = ( 0, 0 );
+
+ $path .= "X"; # indicate end
+ print STDERR "$input\n$source\n$target\n$path\n";
+ for ( my $p = 0 ; $p < length($path) ; $p++ ) {
+ my $action = substr( $path, $p, 1 );
+
+ # beginning of a mismatch
+ if ( $currently_matching && $action ne "M" && $action ne "X" ) {
+ $start_i = $i;
+ $start_s = $s;
+ $currently_matching = 0;
}
-
- print STDERR $target."\n";
- foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n";
- foreach (sort keys %FRAME_INPUT) {
- print STDERR "$_: $FRAME_INPUT{$_}\n";
+ # end of a mismatch
+ elsif ( !$currently_matching
+ && ( $action eq "M" || $action eq "X" ) )
+ {
+
+ # remove use of affected target words
+ for ( my $ss = $start_s ; $ss < $s ; $ss++ ) {
+ foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
+ $TARGET_BITMAP[$tt] = 0;
+ }
+
+ # also remove enclosed unaligned words?
+ }
+
+ # are there input words that need to be inserted ?
+ print STDERR "($start_i<$i)?\n";
+ if ( $start_i < $i ) {
+
+ # take note of input words to be inserted
+ my $insertion = "";
+ for ( my $ii = $start_i ; $ii < $i ; $ii++ ) {
+ $insertion .= $INPUT[$ii] . " ";
+ }
+
+ # find position for inserted input words
+
+ # find first removed target word
+ my $start_t = 1000;
+ for ( my $ss = $start_s ; $ss < $s ; $ss++ ) {
+ foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
+ $start_t = $tt if $tt < $start_t;
+ }
+ }
+
+ # end of sentence? add to end
+ if ( $start_t == 1000 && $i > $#INPUT ) {
+ $start_t = $#TARGET;
+ }
+
+ # backtrack to previous words if unaligned
+ if ( $start_t == 1000 ) {
+ $start_t = -1;
+ for ( my $ss = $s - 1 ; $start_t == -1 && $ss >= 0 ; $ss-- ) {
+ foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
+ $start_t = $tt if $tt > $start_t;
+ }
+ }
+ }
+ $FRAME_INPUT{$start_t} .= $insertion;
+ my %NT = (
+ "start_t" => $start_t,
+ "start_i" => $start_i
+ );
+ push @NT, \%NT;
+ }
+ $currently_matching = 1;
}
- ### STEP 2: BUILD RULE AND FRAME
-
- # hierarchical rule
- my $rule_s = "";
- my $rule_pos_s = 0;
- my %RULE_ALIGNMENT_S;
- for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
- if ($INPUT_BITMAP[$i]) {
- $rule_s .= $INPUT[$i]." ";
- $RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
- }
- foreach my $NT (@NT) {
- if ($i == $$NT{"start_i"}) {
- $rule_s .= "[X][X] ";
- $$NT{"rule_pos_s"} = $rule_pos_s++;
- }
- }
+ print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
+ if ( $action ne "I" ) {
+ print STDERR " ->";
+ foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$s] } ) {
+ print STDERR " " . $tt;
+ }
}
+ print STDERR "\n";
+ $s++ unless $action eq "I";
+ $i++ unless $action eq "D";
+ $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
+ push @INPUT_BITMAP, 1 if $action eq "M";
+ push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
+ }
+
+ print STDERR $target . "\n";
+ foreach (@TARGET_BITMAP) { print STDERR $_; }
+ print STDERR "\n";
+ foreach ( sort keys %FRAME_INPUT ) {
+ print STDERR "$_: $FRAME_INPUT{$_}\n";
+ }
+
+ ### STEP 2: BUILD RULE AND FRAME
- my $rule_t = "";
- my $rule_pos_t = 0;
- my %RULE_ALIGNMENT_T;
- for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
- if ($t>=0 && $TARGET_BITMAP[$t]) {
- $rule_t .= $TARGET[$t]." ";
- $RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
- }
- foreach my $NT (@NT) {
- if ($t == $$NT{"start_t"}) {
- $rule_t .= "[X][X] ";
- $$NT{"rule_pos_t"} = $rule_pos_t++;
- }
- }
+ # hierarchical rule
+ my $rule_s = "";
+ my $rule_pos_s = 0;
+ my %RULE_ALIGNMENT_S;
+ for ( my $i = 0 ; $i < scalar(@INPUT_BITMAP) ; $i++ ) {
+ if ( $INPUT_BITMAP[$i] ) {
+ $rule_s .= $INPUT[$i] . " ";
+ $RULE_ALIGNMENT_S{ $ALIGNMENT_I_TO_S{$i} } = $rule_pos_s++;
+ }
+ foreach my $NT (@NT) {
+ if ( $i == $$NT{"start_i"} ) {
+ $rule_s .= "[X][X] ";
+ $$NT{"rule_pos_s"} = $rule_pos_s++;
+ }
}
+ }
- my $rule_alignment = "";
- foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
- foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
- next unless defined($RULE_ALIGNMENT_T{$t});
- $rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
- }
+ my $rule_t = "";
+ my $rule_pos_t = 0;
+ my %RULE_ALIGNMENT_T;
+ for ( my $t = -1 ; $t < scalar(@TARGET_BITMAP) ; $t++ ) {
+ if ( $t >= 0 && $TARGET_BITMAP[$t] ) {
+ $rule_t .= $TARGET[$t] . " ";
+ $RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
}
foreach my $NT (@NT) {
- $rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
+ if ( $t == $$NT{"start_t"} ) {
+ $rule_t .= "[X][X] ";
+ $$NT{"rule_pos_t"} = $rule_pos_t++;
+ }
}
-
- chop($rule_s);
- chop($rule_t);
- chop($rule_alignment);
-
- my $rule_alignment_inv = "";
- foreach (split(/ /,$rule_alignment)) {
- /^(\d+)\-(\d+)$/;
- $rule_alignment_inv .= "$2-$1 ";
+ }
+
+ my $rule_alignment = "";
+ foreach my $s ( sort { $a <=> $b } keys %RULE_ALIGNMENT_S ) {
+ foreach my $t ( keys %{ $ALIGN{"s"}[$s] } ) {
+ next unless defined( $RULE_ALIGNMENT_T{$t} );
+ $rule_alignment .=
+ $RULE_ALIGNMENT_S{$s} . "-" . $RULE_ALIGNMENT_T{$t} . " ";
+ }
+ }
+ foreach my $NT (@NT) {
+ $rule_alignment .= $$NT{"rule_pos_s"} . "-" . $$NT{"rule_pos_t"} . " ";
+ }
+
+ chop($rule_s);
+ chop($rule_t);
+ chop($rule_alignment);
+
+ my $rule_alignment_inv = "";
+ foreach ( split( / /, $rule_alignment ) ) {
+ /^(\d+)\-(\d+)$/;
+ $rule_alignment_inv .= "$2-$1 ";
+ }
+ chop($rule_alignment_inv);
+
+ # frame
+ my $frame = "";
+ $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
+
+ my $currently_included = 0;
+ my $start_t = -1;
+ push @TARGET_BITMAP, 0; # indicate end
+
+ for ( my $t = 0 ; $t <= scalar(@TARGET) ; $t++ ) {
+
+ # beginning of tm target inclusion
+ if ( !$currently_included && $TARGET_BITMAP[$t] ) {
+ $start_t = $t;
+ $currently_included = 1;
}
- chop($rule_alignment_inv);
-
- # frame
- my $frame = "";
- $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
-
- my $currently_included = 0;
- my $start_t = -1;
- push @TARGET_BITMAP,0; # indicate end
-
- for(my $t=0;$t<=scalar(@TARGET);$t++) {
- # beginning of tm target inclusion
- if (!$currently_included && $TARGET_BITMAP[$t]) {
- $start_t = $t;
- $currently_included = 1;
- }
-
- # end of tm target inclusion (not included word or inserted input)
- elsif ($currently_included &&
- (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
- # add xml (unless change is at the beginning of the sentence
- if ($start_t >= 0) {
- my $target = "";
- print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
- for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
- $target .= $TARGET[$tt] . " ";
- }
- chop($target);
- $frame .= "<xml translation=\"$target\"> x </xml> ";
- }
- $currently_included = 0;
- }
-
- $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
- print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
+
+ # end of tm target inclusion (not included word or inserted input)
+ elsif ( $currently_included
+ && ( !$TARGET_BITMAP[$t] || defined( $FRAME_INPUT{$t} ) ) )
+ {
+ # add xml (unless change is at the beginning of the sentence
+ if ( $start_t >= 0 ) {
+ my $target = "";
+ print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
+ for ( my $tt = $start_t ; $tt < $t + $TARGET_BITMAP[$t] ; $tt++ ) {
+ $target .= $TARGET[$tt] . " ";
+ }
+ chop($target);
+ $frame .= "<xml translation=\"$target\"> x </xml> ";
+ }
+ $currently_included = 0;
}
- print STDERR $frame."\n-------------------------------------\n";
- return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
+ $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
+ print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
+ }
+
+ print STDERR $frame . "\n-------------------------------------\n";
+ return ( $frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv );
}
sub create_alignment {
- my ($line) = @_;
- my (@ALIGNED_TO_S,@ALIGNED_TO_T);
- foreach my $point (split(/ /,$line)) {
- my ($s,$t) = split(/\-/,$point);
- $ALIGNED_TO_S[$s]{$t}++;
- $ALIGNED_TO_T[$t]{$s}++;
- }
- my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T );
- return %ALIGNMENT;
+ my ($line) = @_;
+ my ( @ALIGNED_TO_S, @ALIGNED_TO_T );
+ foreach my $point ( split( / /, $line ) ) {
+ my ( $s, $t ) = split( /\-/, $point );
+ $ALIGNED_TO_S[$s]{$t}++;
+ $ALIGNED_TO_T[$t]{$s}++;
+ }
+ my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T );
+ return %ALIGNMENT;
}
# Perl trim function to remove whitespace from the start and end of the string
-sub trim($)
-{
- my $string = shift;
- $string =~ s/^\s+//;
- $string =~ s/\s+$//;
- return $string;
+sub trim($) {
+ my $string = shift;
+ $string =~ s/^\s+//;
+ $string =~ s/\s+$//;
+ return $string;
}
+
# Left trim function to remove leading whitespace
-sub ltrim($)
-{
- my $string = shift;
- $string =~ s/^\s+//;
- return $string;
+sub ltrim($) {
+ my $string = shift;
+ $string =~ s/^\s+//;
+ return $string;
}
+
# Right trim function to remove trailing whitespace
-sub rtrim($)
-{
- my $string = shift;
- $string =~ s/\s+$//;
- return $string;
-} \ No newline at end of file
+sub rtrim($) {
+ my $string = shift;
+ $string =~ s/\s+$//;
+ return $string;
+}