perltidy

author: Hieu Hoang <hieuhoang@gmail.com> 2012-10-19 21:38:46 +0400
committer: Hieu Hoang <hieuhoang@gmail.com> 2012-10-19 21:38:46 +0400
commit: ea48ab7845773a2da2bca48a3dd067ddcffa8f4b (patch)
tree: 8d1be66b3f097cee6c0f0326692f4c98b8999873 /scripts/fuzzy-match/create_xml.perl
parent: 7a19ac861c4f11a9171bfcf66475797f57f27b26 (diff)
1 files changed, 267 insertions, 252 deletions
diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl
index de3d61f71..36715cc35 100755
--- a/scripts/fuzzy-match/create_xml.perl
+++ b/scripts/fuzzy-match/create_xml.perl
@@ -1,7 +1,7 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/perl -w -d
 
-binmode(STDIN, ":utf8");
-binmode(STDOUT, ":utf8");
+binmode( STDIN,  ":utf8" );
+binmode( STDOUT, ":utf8" );
 
 use strict;
 use FindBin qw($RealBin);
@@ -9,49 +9,58 @@ use File::Basename;
 
 sub trim($);
 
+print STDERR "HELLO ";
+for ( my $i = 0 ; $i < scalar @ARGV ; ++$i ) {
+  print STDERR $ARGV[$i] . " ";
+}
+print STDERR "GOODBYE \n";
+
 ############################################
-# START 
+# START
 
 my $inPath = $ARGV[0];
-open(IN,"<".$inPath);
+open( IN, "<" . $inPath );
 
-open(RULE,">$inPath.extract");
-open(RULE_INV,">$inPath.extract.inv");
+open( RULE,     ">$inPath.extract" );
+open( RULE_INV, ">$inPath.extract.inv" );
 
-my ($sentenceInd, $score, $source, $input, $target, $align, $path, $count);
+my ( $sentenceInd, $score, $source, $input, $target, $align, $path, $count );
 
 # MAIN LOOP
-while ($sentenceInd = <IN>) { 
-	$score = <IN>;
-	$source = <IN>;
-	$input = <IN>;
-	$target = <IN>;
-	$align = <IN>;
-	$path = <IN>;
-	$count = <IN>;
-	chomp($sentenceInd);
-	chomp($score);
-	chomp($source);
-	chomp($input);
-	chomp($target);
-	chomp($align);
-	chomp($path);
-	chomp($count);
-	$source = trim($sentenceInd);
-	$source = trim($score);
-	$source = trim($source);
-	$input 	= trim($input);
-	$target	= trim($target);
-	$align	= trim($align);
-	$path	= trim($path);
-	$count	= trim($count);
-	
-	my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) = &create_xml($source, $input, $target, $align, $path);
-
-	#print STDOUT $frame."\n";
-	print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $count\n";
-	print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $count\n";
-	#print STDOUT "$sentenceInd ||| $score ||| $count\n";
+while ( $sentenceInd = <IN> ) {
+  $score  = <IN>;
+  $source = <IN>;
+  $input  = <IN>;
+  $target = <IN>;
+  $align  = <IN>;
+  $path   = <IN>;
+  $count  = <IN>;
+  chomp($sentenceInd);
+  chomp($score);
+  chomp($source);
+  chomp($input);
+  chomp($target);
+  chomp($align);
+  chomp($path);
+  chomp($count);
+  $source = trim($sentenceInd);
+  $source = trim($score);
+  $source = trim($source);
+  $input  = trim($input);
+  $target = trim($target);
+  $align  = trim($align);
+  $path   = trim($path);
+  $count  = trim($count);
+
+  my ( $frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv ) =
+    &create_xml( $source, $input, $target, $align, $path );
+
+  #print STDOUT $frame."\n";
+  print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $count\n";
+  print RULE_INV
+    "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $count\n";
+
+  #print STDOUT "$sentenceInd ||| $score ||| $count\n";
 
 }
 
@@ -65,245 +74,251 @@ close(RULE_INV);
 my $lex_file = "-";
 
 my $cmd;
-$cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $inPath.extract -lexical-file $lex_file -score-options \"--NoLex\" -phrase-translation-table $inPath.pt";
+$cmd =
+"$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $inPath.extract -lexical-file $lex_file -score-options \"--NoLex\" -phrase-translation-table $inPath.pt";
 print STDERR "Executing: $cmd \n";
 `$cmd`;
 
 #######################################################
 sub create_xml {
-    my ($source,$input,$target,$alignment,$path) = @_;
-    
-    my @INPUT = split(/ /,$input);
-    my @SOURCE = split(/ /,$source);
-    my @TARGET = split(/ /,$target);
-    my %ALIGN = &create_alignment($alignment);
-    
-    my %FRAME_INPUT;
-    my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
-    foreach (@TARGET) { push @TARGET_BITMAP,1 }
-    
-    ### STEP 1: FIND MISMATCHES
-
-    my ($s,$i) = (0,0);
-    my $currently_matching = 0;
-    my ($start_s,$start_i) = (0,0);
-
-    $path .= "X"; # indicate end
-    print STDERR "$input\n$source\n$target\n$path\n";
-    for(my $p=0;$p<length($path);$p++) {
-	my $action = substr($path,$p,1);
-
-	# beginning of a mismatch
-	if ($currently_matching && $action ne "M" && $action ne "X") {
-	    $start_i = $i;
-	    $start_s = $s;
-	    $currently_matching = 0;
-	}
-	
-	# end of a mismatch
-	elsif (!$currently_matching && 
-	       ($action eq "M" || $action eq "X")) {
-	    
-	    # remove use of affected target words
-	    for(my $ss = $start_s; $ss<$s; $ss++) {
-		foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
-		    $TARGET_BITMAP[$tt] = 0;
-		}
-		
-		# also remove enclosed unaligned words?
-	    }
-	    
-	    # are there input words that need to be inserted ?
-	    print STDERR "($start_i<$i)?\n";
-	    if ($start_i<$i) {
-		
-		# take note of input words to be inserted
-		my $insertion = "";
-		for(my $ii = $start_i; $ii<$i; $ii++) {
-		    $insertion .= $INPUT[$ii]." ";
-		}
-		
-		# find position for inserted input words
-		
-		# find first removed target word
-		my $start_t = 1000;
-		for(my $ss = $start_s; $ss<$s; $ss++) {
-		    foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
-			$start_t = $tt if $tt < $start_t;
-		    }
-		}
-
-		# end of sentence? add to end
-		if ($start_t == 1000 && $i > $#INPUT) {
-		    $start_t = $#TARGET;
-		}
-		
-		# backtrack to previous words if unaligned
-		if ($start_t == 1000) {
-		    $start_t = -1;
-		    for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
-			foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
-			    $start_t = $tt if $tt > $start_t;
-			}
-		    }
-		}
-		$FRAME_INPUT{$start_t} .= $insertion;
-		my %NT = ("start_t" => $start_t,
-			  "start_i" => $start_i );
-		push @NT,\%NT;		
-	    }	    
-	    $currently_matching = 1;
-	}
-	
-	print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
-	if ($action ne "I") {
-	    print STDERR " ->";
-	    foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
-		print STDERR " ".$tt;
-	    }
-	}
-	print STDERR "\n";
-	$s++ unless $action eq "I";
-	$i++ unless $action eq "D";
-	$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
-	push @INPUT_BITMAP, 1 if $action eq "M";
-	push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
+  my ( $source, $input, $target, $alignment, $path ) = @_;
+
+  my @INPUT  = split( / /, $input );
+  my @SOURCE = split( / /, $source );
+  my @TARGET = split( / /, $target );
+  my %ALIGN  = &create_alignment($alignment);
+
+  my %FRAME_INPUT;
+  my ( @NT, @INPUT_BITMAP, @TARGET_BITMAP, %ALIGNMENT_I_TO_S );
+  foreach (@TARGET) { push @TARGET_BITMAP, 1 }
+
+  ### STEP 1: FIND MISMATCHES
+
+  my ( $s, $i ) = ( 0, 0 );
+  my $currently_matching = 0;
+  my ( $start_s, $start_i ) = ( 0, 0 );
+
+  $path .= "X";    # indicate end
+  print STDERR "$input\n$source\n$target\n$path\n";
+  for ( my $p = 0 ; $p < length($path) ; $p++ ) {
+    my $action = substr( $path, $p, 1 );
+
+    # beginning of a mismatch
+    if ( $currently_matching && $action ne "M" && $action ne "X" ) {
+      $start_i            = $i;
+      $start_s            = $s;
+      $currently_matching = 0;
     }
-    
 
-    print STDERR $target."\n";
-    foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n";
-    foreach (sort keys %FRAME_INPUT) { 
-	print STDERR "$_: $FRAME_INPUT{$_}\n";
+    # end of a mismatch
+    elsif ( !$currently_matching
+      && ( $action eq "M" || $action eq "X" ) )
+    {
+
+      # remove use of affected target words
+      for ( my $ss = $start_s ; $ss < $s ; $ss++ ) {
+        foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
+          $TARGET_BITMAP[$tt] = 0;
+        }
+
+        # also remove enclosed unaligned words?
+      }
+
+      # are there input words that need to be inserted ?
+      print STDERR "($start_i<$i)?\n";
+      if ( $start_i < $i ) {
+
+        # take note of input words to be inserted
+        my $insertion = "";
+        for ( my $ii = $start_i ; $ii < $i ; $ii++ ) {
+          $insertion .= $INPUT[$ii] . " ";
+        }
+
+        # find position for inserted input words
+
+        # find first removed target word
+        my $start_t = 1000;
+        for ( my $ss = $start_s ; $ss < $s ; $ss++ ) {
+          foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
+            $start_t = $tt if $tt < $start_t;
+          }
+        }
+
+        # end of sentence? add to end
+        if ( $start_t == 1000 && $i > $#INPUT ) {
+          $start_t = $#TARGET;
+        }
+
+        # backtrack to previous words if unaligned
+        if ( $start_t == 1000 ) {
+          $start_t = -1;
+          for ( my $ss = $s - 1 ; $start_t == -1 && $ss >= 0 ; $ss-- ) {
+            foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
+              $start_t = $tt if $tt > $start_t;
+            }
+          }
+        }
+        $FRAME_INPUT{$start_t} .= $insertion;
+        my %NT = (
+          "start_t" => $start_t,
+          "start_i" => $start_i
+        );
+        push @NT, \%NT;
+      }
+      $currently_matching = 1;
     }
 
-    ### STEP 2: BUILD RULE AND FRAME
-        
-    # hierarchical rule
-    my $rule_s = "";
-    my $rule_pos_s = 0;
-    my %RULE_ALIGNMENT_S;
-    for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
-		if ($INPUT_BITMAP[$i]) {
-			$rule_s .= $INPUT[$i]." ";
-			$RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
-		}
-		foreach my $NT (@NT) {
-			if ($i == $$NT{"start_i"}) {
-				$rule_s .= "[X][X] ";
-				$$NT{"rule_pos_s"} = $rule_pos_s++;
-			}
-		}
+    print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
+    if ( $action ne "I" ) {
+      print STDERR " ->";
+      foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$s] } ) {
+        print STDERR " " . $tt;
+      }
     }
+    print STDERR "\n";
+    $s++ unless $action eq "I";
+    $i++ unless $action eq "D";
+    $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
+    push @INPUT_BITMAP, 1 if $action eq "M";
+    push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
+  }
+
+  print STDERR $target . "\n";
+  foreach (@TARGET_BITMAP) { print STDERR $_; }
+  print STDERR "\n";
+  foreach ( sort keys %FRAME_INPUT ) {
+    print STDERR "$_: $FRAME_INPUT{$_}\n";
+  }
+
+  ### STEP 2: BUILD RULE AND FRAME
 
-    my $rule_t = "";
-    my $rule_pos_t = 0;
-    my %RULE_ALIGNMENT_T;
-    for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
-	if ($t>=0 && $TARGET_BITMAP[$t]) {
-	    $rule_t .= $TARGET[$t]." ";
-	    $RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
-	}
-	foreach my $NT (@NT) {
-	    if ($t == $$NT{"start_t"}) {
-		$rule_t .= "[X][X] ";
-		$$NT{"rule_pos_t"} = $rule_pos_t++;
-	    }
-	}
+  # hierarchical rule
+  my $rule_s     = "";
+  my $rule_pos_s = 0;
+  my %RULE_ALIGNMENT_S;
+  for ( my $i = 0 ; $i < scalar(@INPUT_BITMAP) ; $i++ ) {
+    if ( $INPUT_BITMAP[$i] ) {
+      $rule_s .= $INPUT[$i] . " ";
+      $RULE_ALIGNMENT_S{ $ALIGNMENT_I_TO_S{$i} } = $rule_pos_s++;
+    }
+    foreach my $NT (@NT) {
+      if ( $i == $$NT{"start_i"} ) {
+        $rule_s .= "[X][X] ";
+        $$NT{"rule_pos_s"} = $rule_pos_s++;
+      }
     }
+  }
 
-    my $rule_alignment = "";
-    foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
-	foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
-	    next unless defined($RULE_ALIGNMENT_T{$t});
-	    $rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
-	}
+  my $rule_t     = "";
+  my $rule_pos_t = 0;
+  my %RULE_ALIGNMENT_T;
+  for ( my $t = -1 ; $t < scalar(@TARGET_BITMAP) ; $t++ ) {
+    if ( $t >= 0 && $TARGET_BITMAP[$t] ) {
+      $rule_t .= $TARGET[$t] . " ";
+      $RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
     }
     foreach my $NT (@NT) {
-	$rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
+      if ( $t == $$NT{"start_t"} ) {
+        $rule_t .= "[X][X] ";
+        $$NT{"rule_pos_t"} = $rule_pos_t++;
+      }
     }
-    
-    chop($rule_s);
-    chop($rule_t);
-    chop($rule_alignment);
-
-    my $rule_alignment_inv = "";
-    foreach (split(/ /,$rule_alignment)) {
-	/^(\d+)\-(\d+)$/;
-	$rule_alignment_inv .= "$2-$1 ";
+  }
+
+  my $rule_alignment = "";
+  foreach my $s ( sort { $a <=> $b } keys %RULE_ALIGNMENT_S ) {
+    foreach my $t ( keys %{ $ALIGN{"s"}[$s] } ) {
+      next unless defined( $RULE_ALIGNMENT_T{$t} );
+      $rule_alignment .=
+        $RULE_ALIGNMENT_S{$s} . "-" . $RULE_ALIGNMENT_T{$t} . " ";
+    }
+  }
+  foreach my $NT (@NT) {
+    $rule_alignment .= $$NT{"rule_pos_s"} . "-" . $$NT{"rule_pos_t"} . " ";
+  }
+
+  chop($rule_s);
+  chop($rule_t);
+  chop($rule_alignment);
+
+  my $rule_alignment_inv = "";
+  foreach ( split( / /, $rule_alignment ) ) {
+    /^(\d+)\-(\d+)$/;
+    $rule_alignment_inv .= "$2-$1 ";
+  }
+  chop($rule_alignment_inv);
+
+  # frame
+  my $frame = "";
+  $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
+
+  my $currently_included = 0;
+  my $start_t            = -1;
+  push @TARGET_BITMAP, 0;    # indicate end
+
+  for ( my $t = 0 ; $t <= scalar(@TARGET) ; $t++ ) {
+
+    # beginning of tm target inclusion
+    if ( !$currently_included && $TARGET_BITMAP[$t] ) {
+      $start_t            = $t;
+      $currently_included = 1;
     }
-    chop($rule_alignment_inv);
-
-    # frame
-    my $frame = "";
-    $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
-
-    my $currently_included = 0;
-    my $start_t = -1;
-    push @TARGET_BITMAP,0; # indicate end
-
-    for(my $t=0;$t<=scalar(@TARGET);$t++) {	    
-	# beginning of tm target inclusion
-	if (!$currently_included && $TARGET_BITMAP[$t]) {
-	    $start_t = $t;
-	    $currently_included = 1;
-	}
-	
-	# end of tm target inclusion (not included word or inserted input)
-	elsif ($currently_included && 
-	       (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
-	    # add xml (unless change is at the beginning of the sentence
-	    if ($start_t >= 0) {
-		my $target = "";
-		print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
-		for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
-		    $target .= $TARGET[$tt] . " ";
-		}
-		chop($target);
-		$frame .= "<xml translation=\"$target\"> x </xml> ";
-	    }
-	    $currently_included = 0;
-	}
-	
-	$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
-	print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
+
+    # end of tm target inclusion (not included word or inserted input)
+    elsif ( $currently_included
+      && ( !$TARGET_BITMAP[$t] || defined( $FRAME_INPUT{$t} ) ) )
+    {
+      # add xml (unless change is at the beginning of the sentence
+      if ( $start_t >= 0 ) {
+        my $target = "";
+        print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
+        for ( my $tt = $start_t ; $tt < $t + $TARGET_BITMAP[$t] ; $tt++ ) {
+          $target .= $TARGET[$tt] . " ";
+        }
+        chop($target);
+        $frame .= "<xml translation=\"$target\"> x </xml> ";
+      }
+      $currently_included = 0;
     }
 
-    print STDERR $frame."\n-------------------------------------\n";
-    return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
+    $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
+    print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
+  }
+
+  print STDERR $frame . "\n-------------------------------------\n";
+  return ( $frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv );
 }
 
 sub create_alignment {
-	my ($line) = @_;
-	my (@ALIGNED_TO_S,@ALIGNED_TO_T);
-	foreach my $point (split(/ /,$line)) {
-		my ($s,$t) = split(/\-/,$point);
-		$ALIGNED_TO_S[$s]{$t}++;
-		$ALIGNED_TO_T[$t]{$s}++;
-	}
-	my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T );
-	return %ALIGNMENT;
+  my ($line) = @_;
+  my ( @ALIGNED_TO_S, @ALIGNED_TO_T );
+  foreach my $point ( split( / /, $line ) ) {
+    my ( $s, $t ) = split( /\-/, $point );
+    $ALIGNED_TO_S[$s]{$t}++;
+    $ALIGNED_TO_T[$t]{$s}++;
+  }
+  my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T );
+  return %ALIGNMENT;
 }
 
 # Perl trim function to remove whitespace from the start and end of the string
-sub trim($)
-{
-	my $string = shift;
-	$string =~ s/^\s+//;
-	$string =~ s/\s+$//;
-	return $string;
+sub trim($) {
+  my $string = shift;
+  $string =~ s/^\s+//;
+  $string =~ s/\s+$//;
+  return $string;
 }
+
 # Left trim function to remove leading whitespace
-sub ltrim($)
-{
-	my $string = shift;
-	$string =~ s/^\s+//;
-	return $string;
+sub ltrim($) {
+  my $string = shift;
+  $string =~ s/^\s+//;
+  return $string;
 }
+
 # Right trim function to remove trailing whitespace
-sub rtrim($)
-{
-	my $string = shift;
-	$string =~ s/\s+$//;
-	return $string;
-}
-\ No newline at end of file
+sub rtrim($) {
+  my $string = shift;
+  $string =~ s/\s+$//;
+  return $string;
+}
author	Hieu Hoang <hieuhoang@gmail.com>	2012-10-19 21:38:46 +0400
committer	Hieu Hoang <hieuhoang@gmail.com>	2012-10-19 21:38:46 +0400
commit	ea48ab7845773a2da2bca48a3dd067ddcffa8f4b (patch)
tree	8d1be66b3f097cee6c0f0326692f4c98b8999873 /scripts/fuzzy-match/create_xml.perl
parent	7a19ac861c4f11a9171bfcf66475797f57f27b26 (diff)