Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2010-02-03 17:23:06 +0300
committer bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2010-02-03 17:23:06 +0300
commit 594e5e8acd45e3bc52d3f4220ba72190918c2461 (patch)
tree 1db08ffc21cbcfcce750a4b2466555fbd0f630af /scripts/analysis
parent 117f5ef329bbaa450eb56b7cd4341581064aed0a (diff)
adding a handy script for suspicious tokenization
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2845 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x scripts/analysis/suspicious_tokenization.pl 75
1 files changed, 75 insertions, 0 deletions
diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl
new file mode 100755
index 000000000..f7ca3c60d
--- /dev/null
+++ b/scripts/analysis/suspicious_tokenization.pl
@@ -0,0 +1,75 @@
+#!/usr/bin/perl
+# Collects and prints all n-grams that appear in the given corpus both
+# tokenized (as n separate tokens) and untokenized (joined into one token).
+# Ondrej Bojar
+
+use strict;
+use warnings;
+
+use Getopt::Long;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+# Command-line options.
+my $usage = 0;      # set by -h/--help/--usage: print help and exit
+my $lowercase = 0;  # set by --lc/--lowercase: fold case before counting
+my $n = 2;          # order of the n-grams to check (default: bigrams)
+GetOptions(
+    "n=i" => \$n, # the n-grams to search for (default: bigrams)
+    "lc|lowercase" => \$lowercase, # ignore case
+    "h|help|usage" => \$usage, # show info
+) or exit 1;
+
+# The help flag was parsed but never acted upon; honour it here.
+if ($usage) {
+    print "usage: $0 [--n=N] [--lowercase] [corpus-file ...]\n"
+        . "Reads the corpus from the given files (or stdin) and prints\n"
+        . "every n-gram that also occurs joined into a single token:\n"
+        . "  ngram <TAB> untokenized count <TAB> tokenized count <TAB> |difference|\n"
+        . "Options:\n"
+        . "  --n=N               n-gram order to search for (default: 2)\n"
+        . "  --lc, --lowercase   ignore case\n"
+        . "  -h, --help, --usage show this message\n";
+    exit 0;
+}
+
+# An n-gram order below 1 is meaningless, so reject it before reading input.
+die "$0: --n must be a positive integer (got $n)\n" if $n < 1;
+
+# Read the corpus line by line, counting single tokens and n-grams.
+my $nl = 0;   # number of input lines read so far
+my $ngrams;   # hashref: space-joined n-gram => number of occurrences
+my $words;    # hashref: single token => number of occurrences
+while (<>) {
+    $nl++;
+    # Progress indicator: a dot every 100k lines, the line count every 500k.
+    print STDERR "." if $nl % 100000 == 0;
+    print STDERR "($nl)" if $nl % 500000 == 0;
+    chomp;
+    $_ = lc($_) if $lowercase;
+    # split /\s+/ yields an empty first field when the line starts with
+    # whitespace. That empty "token" would form n-grams such as " a", which
+    # collapse to "a" once spaces are removed and get reported as false
+    # positives, so drop empty fields.
+    my @words = grep { length } split /\s+/;
+    foreach my $w (@words) {
+        $words->{$w}++;
+    }
+    $ngrams = ngrams($n, \@words, $ngrams); # add n-gram counts from this line
+}
+print STDERR "Done.\n";
+
+# Find suspicious n-grams: those whose tokens, glued together without the
+# separating spaces, were also seen in the corpus as one single token.
+# For each such n-gram record both counts and their absolute difference.
+my $report;
+for my $ngram (keys %$ngrams) {
+    (my $joined = $ngram) =~ s/ //g;
+    my $seen_joined = $words->{$joined};
+    next unless $seen_joined; # never seen untokenized
+    my $seen_split = $ngrams->{$ngram};
+    $report->{$ngram} = {
+        "tok" => $seen_split,
+        "untok" => $seen_joined,
+        "diff" => abs($seen_joined - $seen_split),
+    };
+}
+
+# Report: one tab-separated line per suspicious n-gram with the columns
+#   n-gram, untokenized count, tokenized count, absolute difference
+# ordered by increasing difference. Ties are broken by the n-gram itself so
+# that the output does not depend on Perl's per-run hash ordering.
+foreach my $ngr (
+    sort {
+        $report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}
+            or $a cmp $b
+    } keys %$report
+) {
+    my $entry = $report->{$ngr};
+    print "$ngr\t$entry->{untok}\t$entry->{tok}\t$entry->{diff}\n";
+}
+
+# Adds the counts of all n-grams found in one sentence to a hash of counts.
+#   $n     - order of the n-grams to collect
+#   $words - reference to the array of tokens of the sentence (not modified)
+#   $out   - hashref with the counts collected so far; may be undef
+# Returns $out, or a newly created hashref if $out was undef and at least
+# one n-gram was counted. Keys are the n tokens joined by single spaces.
+# Sentences shorter than $n tokens contribute nothing.
+sub ngrams {
+    my $n = shift;
+    my $words = shift;
+    my $out = shift;
+    # An order below 1 describes no n-gram at all. The previous shift-based
+    # loop never terminated for such values, because shifting an empty array
+    # cannot bring its last index below -1.
+    return $out if $n < 1;
+    # Slide a window of $n tokens over the sentence by index instead of
+    # copying the array and shifting it; the range is empty when the
+    # sentence is too short. For $n == 1 the key is the token itself.
+    foreach my $start (0 .. $#{$words} - $n + 1) {
+        $out->{join(" ", @{$words}[$start .. $start + $n - 1])}++;
+    }
+    return $out;
+}