adding a handy script for suspicious tokenization

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2845 1f5c12ca-751b-0410-a591-d2e778427230
author: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2010-02-03 17:23:06 +0300
committer: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2010-02-03 17:23:06 +0300
commit: 594e5e8acd45e3bc52d3f4220ba72190918c2461 (patch)
tree: 1db08ffc21cbcfcce750a4b2466555fbd0f630af /scripts/analysis
parent: 117f5ef329bbaa450eb56b7cd4341581064aed0a (diff)
1 files changed, 75 insertions, 0 deletions
diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl
new file mode 100755
index 000000000..f7ca3c60d
--- /dev/null
+++ b/scripts/analysis/suspicious_tokenization.pl
@@ -0,0 +1,75 @@
+#!/usr/bin/perl
+# Collects and prints all n-grams that appear in the given corpus both
+# tokenized (as separate tokens) and untokenized (glued into one token).
+# Ondrej Bojar
+
+use strict;
+use warnings;
+
+use Getopt::Long;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+my $usage = 0;
+my $lowercase = 0;
+my $n = 2;
+GetOptions(
+  "n=i" => \$n,  # the n-grams to search for (default: bigrams)
+  "lc|lowercase" => \$lowercase, # ignore case
+  "h|help|usage" => \$usage, # show info
+) or exit 1;
+if ($usage) { # the help flag used to be parsed but then ignored
+  print "usage: $0 [--n=N] [--lc] < corpus > report\n";
+  exit 0;
+}
+die "--n must be at least 1\n" if $n < 1; # no n-grams shorter than one token
+
+# Pass 1: count every token and every n-gram of the corpus.
+my $nl = 0;
+my $ngrams = {};
+my $words = {};
+while (<>) {
+  $nl++;
+  print STDERR "." if $nl % 100000 == 0;
+  print STDERR "($nl)" if $nl % 500000 == 0;
+  chomp;
+  $_ = lc($_) if $lowercase;
+  my @words = split ' '; # unlike /\s+/, no empty token for leading blanks
+  $words->{$_}++ foreach @words;
+  $ngrams = ngrams($n, \@words, $ngrams); # add ngram counts from this
+}
+print STDERR "Done.\n";
+
+# Pass 2: keep n-grams whose tokens, glued together, also occur as one token.
+my $report = {};
+foreach my $ngr (keys %$ngrams) {
+  (my $w = $ngr) =~ s/ //g;
+  my $untokcnt = $words->{$w} or next; # never seen untokenized
+  $report->{$ngr} = { "tok" => $ngrams->{$ngr}, "untok" => $untokcnt,
+                      "diff" => abs($untokcnt - $ngrams->{$ngr}) };
+}
+
+# Print n-gram, untok count, tok count, |diff|; smallest diff first.
+foreach my $ngr (sort {$report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}
+                       or $a cmp $b} keys %$report) { # stable order on ties
+  print "$ngr\t$report->{$ngr}->{untok}\t$report->{$ngr}->{tok}\t$report->{$ngr}->{diff}\n";
+}
+
+# Add the counts of all n-grams of one token list to a hash of counts.
+#   $n     ... n-gram length; values below 1 yield no n-grams
+#   $words ... reference to the array of tokens (left unmodified)
+#   $out   ... hashref of counts to update (a new one is made if undef)
+# Returns the hashref mapping space-joined n-grams to their counts.
+sub ngrams {
+  my ($n, $words, $out) = @_;
+  $out = {} if !defined $out;
+  # There is no n-gram shorter than one token; bail out instead of looping.
+  return $out if $n < 1;
+  # Slide a window of $n tokens; for $n == 1 this counts the tokens.
+  foreach my $start (0 .. $#$words - $n + 1) {
+    $out->{join(" ", @{$words}[$start .. $start + $n - 1])}++;
+  }
+  return $out;
+}
author	bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>	2010-02-03 17:23:06 +0300
committer	bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>	2010-02-03 17:23:06 +0300
commit	594e5e8acd45e3bc52d3f4220ba72190918c2461 (patch)
tree	1db08ffc21cbcfcce750a4b2466555fbd0f630af /scripts/analysis
parent	117f5ef329bbaa450eb56b7cd4341581064aed0a (diff)