#!/usr/bin/perl
# Collects and prints all n-grams that appear in the given corpus both
# tokenized as well as untokenized.
# Ondrej Bojar
#
# Provenance (from the patch this script was extracted from):
#   commit 594e5e8acd45e3bc52d3f4220ba72190918c2461
#   author bojar, Wed, 3 Feb 2010 14:23:06 +0000
#   subject "adding a handy script for suspicious tokenization"
#   git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2845 1f5c12ca-751b-0410-a591-d2e778427230
#   path scripts/analysis/suspicious_tokenization.pl
#
# Output: one line per suspicious n-gram, tab-separated:
#   n-gram, count as a single (untokenized) word, count as an n-gram
#   (tokenized), absolute difference of the two counts.
# Lines are sorted by ascending difference; ties are broken by the n-gram
# string so that the output is reproducible.

use strict;
use warnings;

use Getopt::Long;

# Run the program only when executed as a script. When the file is loaded
# with require/do (e.g. by a test), only the subs below are defined.
main() unless caller;

# Entry point: parse options, count words and n-grams, print the report.
sub main {
    binmode(STDIN,  ":utf8");
    binmode(STDOUT, ":utf8");
    binmode(STDERR, ":utf8");

    my $usage     = 0;
    my $lowercase = 0;
    my $n         = 2;
    GetOptions(
        "n=i"          => \$n,         # the n-grams to search for (default: bigrams)
        "lc|lowercase" => \$lowercase, # ignore case
        "h|help|usage" => \$usage,     # show info
    ) or exit 1;

    if ($usage) {
        print usage_text();
        exit 0;
    }

    # An n below 1 has no meaning and used to send ngrams() into an
    # endless loop, so reject it up front.
    die "$0: --n must be a positive integer (got $n)\n" if $n < 1;

    my $words  = {};    # token       => number of occurrences
    my $ngrams = {};    # "w1 w2 ..." => number of occurrences
    my $nl     = 0;     # lines read so far, across all inputs

    # Files are opened explicitly (instead of via <>) so that every input,
    # not just STDIN, is decoded as UTF-8.
    foreach my $path (@ARGV ? @ARGV : ("-")) {
        my $fh = open_corpus($path) or next;
        while (my $line = <$fh>) {
            $nl++;
            print STDERR "." if $nl % 100000 == 0;
            print STDERR "($nl)" if $nl % 500000 == 0;
            my @tokens = tokenize_line($line, $lowercase);
            foreach my $w (@tokens) {
                $words->{$w}++;
            }
            $ngrams = ngrams($n, \@tokens, $ngrams);    # add ngram counts from this line
        }
        close($fh) unless $path eq "-";
    }
    print STDERR "Done.\n";

    print report_lines(find_suspicious($ngrams, $words));
    return;
}

# Returns the help text printed for -h / --help / --usage.
sub usage_text {
    return <<"EOT";
Usage: suspicious_tokenization.pl [options] [corpus-file ...] > report
Reads the corpus from the given files (or from standard input) and prints
all n-grams that also occur joined together as a single token.

Options:
  --n=N              length of the n-grams to search for (default: 2)
  --lc, --lowercase  ignore case
  -h, --help, --usage  show this message

Output columns (tab-separated):
  n-gram, count untokenized, count tokenized, absolute difference
EOT
}

# Opens one corpus input for reading as UTF-8.
#   $path - file name, or "-" for standard input
# Returns a filehandle, or nothing (after a warning) if the file cannot be
# opened, so that the remaining inputs are still processed.
sub open_corpus {
    my ($path) = @_;
    return \*STDIN if $path eq "-";
    my $fh;
    if (!open($fh, "<:utf8", $path)) {
        warn "Can't open $path: $!\n";
        return;
    }
    return $fh;
}

# Splits one corpus line into its whitespace-separated tokens.
#   $line      - the raw line (a trailing newline is fine)
#   $lowercase - if true, the line is lowercased first
# Returns the list of tokens. split ' ' skips leading whitespace, so an
# indented line does not yield an empty first token.
sub tokenize_line {
    my ($line, $lowercase) = @_;
    $line = lc($line) if $lowercase;
    return split(' ', $line);
}

# Finds the n-grams whose words, joined without spaces, were also seen as a
# single token.
#   $ngrams - hashref: n-gram (words joined by one space) => count
#   $words  - hashref: token => count
# Returns a hashref: n-gram => { tok => count as n-gram,
#                                untok => count as a single token,
#                                diff => abs(untok - tok) }.
sub find_suspicious {
    my ($ngrams, $words) = @_;
    my %report;
    foreach my $ngr (keys %$ngrams) {
        (my $joined = $ngr) =~ s/ //g;
        my $untokcnt = $words->{$joined};
        next if !$untokcnt;    # never seen untokenized
        my $tokcnt = $ngrams->{$ngr};
        $report{$ngr} = {
            "tok"   => $tokcnt,
            "untok" => $untokcnt,
            "diff"  => abs($untokcnt - $tokcnt),
        };
    }
    return \%report;
}

# Formats the report built by find_suspicious().
#   $report - hashref as returned by find_suspicious()
# Returns the list of output lines (each ending in a newline), ordered by
# ascending "diff" and then by n-gram string to keep ties deterministic.
sub report_lines {
    my ($report) = @_;
    my @order = sort {
        $report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}
            or $a cmp $b
    } keys %$report;
    return map {
        join("\t", $_, @{ $report->{$_} }{qw(untok tok diff)}) . "\n"
    } @order;
}

# Adds the counts of all n-grams of one sentence to a running total.
#   $n     - n-gram length
#   $words - arrayref of the tokens of the sentence (not modified)
#   $out   - hashref of counts collected so far, or undef to start fresh
# Returns the (possibly newly created) hashref; keys are the n-gram words
# joined by a single space. For $n < 1 nothing is added.
sub ngrams {
    my ($n, $words, $out) = @_;
    $out = {} unless defined $out;
    return $out if $n < 1;
    # The range is empty when the sentence is shorter than $n.
    foreach my $start (0 .. @$words - $n) {
        $out->{ join(" ", @{$words}[ $start .. $start + $n - 1 ]) }++;
    }
    return $out;
}

1;