scripts/analysis/suspicious_tokenization.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

#!/usr/bin/perl
# Collects and prints all n-grams that appear in the given corpus both
# tokenized as well as untokenized.
# Ondrej Bojar

use strict;
use warnings;

use Getopt::Long;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $usage = 0;
my $lowercase = 0;
my $n = 2;
GetOptions(
  "n=i" => \$n,  # the n-grams to search for (default: bigrams)
  "lc|lowercase" => \$lowercase, # ignore case
  "h|help|usage" => \$usage, # show info
) or exit 1;

my $nl = 0;
my $ngrams;
my $words;
while (<>) {
  $nl++;
  print STDERR "." if $nl % 100000 == 0;
  print STDERR "($nl)" if $nl % 500000 == 0;
  chomp;
  $_ = lc($_) if $lowercase;
  my @words = split /\s+/;
  foreach my $w (@words) {
    $words->{$w}++;
  }
  $ngrams = ngrams($n, \@words, $ngrams); # add ngram counts from this
}
print STDERR "Done.\n";

# Find suspicious
my $report;
foreach my $ngr (keys %$ngrams) {
  my $w = $ngr;
  $w =~ s/ //g;
  my $untokcnt = $words->{$w};
  next if ! $untokcnt; # never seen untokenized
  my $tokcnt = $ngrams->{$ngr};
  $report->{$ngr}->{"tok"} = $tokcnt;
  $report->{$ngr}->{"untok"} = $untokcnt;
  $report->{$ngr}->{"diff"} = abs($untokcnt-$tokcnt);
}

# Report
foreach my $ngr (sort {$report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}}
                  keys %$report) {
  print "$ngr\t$report->{$ngr}->{untok}\t$report->{$ngr}->{tok}\t$report->{$ngr}->{diff}\n";
}

sub ngrams {
  my $n = shift;
  my @words = @{shift()};
  my $out = shift;
  if ($n == 1) {
    foreach my $w (@words) {
      $out->{$w}++;
    }
  } else {
    while ($#words >= $n-1) {
      $out->{join(" ", @words[0..$n-1])}++;
      shift @words;
    }
  }
  return $out;
}