improvements to web analysis, fixes to syntax wrappers

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3633 1f5c12ca-751b-0410-a591-d2e778427230
author: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-10-21 13:49:27 +0400
committer: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-10-21 13:49:27 +0400
commit: 85a5a13e4c722fccc28394d06da1ea0194bf7ab6 (patch)
tree: 97876a77307d0a21bbf0be6ea913c1a9f067a225 /scripts/training/clean-corpus-n.perl
parent: 88eaf49c5e051f7c1202d01aa2136c811c17e401 (diff)
1 files changed, 23 insertions, 9 deletions
diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index df77e5f89..bea32052a 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -6,6 +6,7 @@ use Getopt::Long;
 my $help;
 my $lc = 0; # lowercase the corpus?
 my $ignore_ratio = 0;
+my $ignore_xml = 0;
 my $enc = "utf8"; # encoding of the input and output files
     # set to anything else you wish, but I have not tested it yet
 my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
@@ -17,6 +18,7 @@ GetOptions(
   "lowercase|lc" => \$lc,
   "encoding=s" => \$enc,
   "ignore-ratio" => \$ignore_ratio,
+  "ignore-xml" => \$ignore_xml,
   "max-word-length|mwl=s" => \$max_word_length
 ) or exit(1);
 
@@ -108,14 +110,15 @@ while(my $f = <F>) {
   $f =~ s/ $//;
   next if $f eq '';
   next if $e eq '';
-  my @E = split(/ /,$e);
-  my @F = split(/ /,$f);
-  next if scalar(@E) > $max;
-  next if scalar(@F) > $max;
-  next if scalar(@E) < $min;
-  next if scalar(@F) < $min;
-  next if !$ignore_ratio && scalar(@E)/scalar(@F) > 9;
-  next if !$ignore_ratio && scalar(@F)/scalar(@E) > 9;
+
+  my $ec = &word_count($e);
+  my $fc = &word_count($f);
+  next if $ec > $max;
+  next if $fc > $max;
+  next if $ec < $min;
+  next if $fc < $min;
+  next if !$ignore_ratio && $ec/$fc > 9;
+  next if !$ignore_ratio && $fc/$ec > 9;
   # Skip this segment if any factor is longer than $max_word_length
   my $max_word_length_plus_one = $max_word_length + 1;
   next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
@@ -126,7 +129,6 @@ while(my $f = <F>) {
     if $f =~ /[ \|]\|/;
   die "There is a blank factor in $corpus.$l2 on line $innr: $e"
     if $e =~ /[ \|]\|/;
-    
   
   $outnr++;
   print FO $f."\n";
@@ -146,3 +148,15 @@ my $e = <E>;
 die "$corpus.$l2 is too long!" if defined $e;
 
 print STDERR "Input sentences: $innr  Output sentences:  $outnr\n";
+
+sub word_count { # number of space-separated tokens in $line; XML tags are not counted under --ignore-xml
+  my ($line) = @_;
+  if ($ignore_xml) {
+    $line =~ s/<\S[^>]*\S>/ /g; # a tag becomes a space (not ''), so "foo<np>bar" still counts as two words
+    $line =~ s/\s+/ /g;         # collapse the whitespace runs left behind by removed tags
+    $line =~ s/^ //;            # drop the leading blank so split yields no empty first field
+    $line =~ s/ $//;
+  }
+  my @w = split(/ /,$line);
+  return scalar @w;
+}
author	phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>	2010-10-21 13:49:27 +0400
committer	phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>	2010-10-21 13:49:27 +0400
commit	85a5a13e4c722fccc28394d06da1ea0194bf7ab6 (patch)
tree	97876a77307d0a21bbf0be6ea913c1a9f067a225 /scripts/training/clean-corpus-n.perl
parent	88eaf49c5e051f7c1202d01aa2136c811c17e401 (diff)