diff options
author | bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> | 2010-02-03 19:37:08 +0300 |
---|---|---|
committer | bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> | 2010-02-03 19:37:08 +0300 |
commit | ff05e5a1b5c8dfacdcd2563533a1eba259d7b572 (patch) | |
tree | b4213a72fac6b27abd816737a6379f6c1a15b8b2 /scripts/analysis | |
parent | 9b10946f108e00670e0a0e3f51242d10273d34a7 (diff) |
list frequent mismatched tokenizations first
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2852 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x | scripts/analysis/suspicious_tokenization.pl | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl
index f7ca3c60d..29e32d271 100755
--- a/scripts/analysis/suspicious_tokenization.pl
+++ b/scripts/analysis/suspicious_tokenization.pl
@@ -49,10 +49,14 @@ foreach my $ngr (keys %$ngrams) {
   $report->{$ngr}->{"tok"} = $tokcnt;
   $report->{$ngr}->{"untok"} = $untokcnt;
   $report->{$ngr}->{"diff"} = abs($untokcnt-$tokcnt);
+  $report->{$ngr}->{"sum"} = $untokcnt+$tokcnt;
 }
 
 # Report
-foreach my $ngr (sort {$report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}}
+foreach my $ngr (sort {
+    $report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}
+    || $report->{$b}->{"sum"} <=> $report->{$a}->{"sum"}
+  }
     keys %$report) {
   print "$ngr\t$report->{$ngr}->{untok}\t$report->{$ngr}->{tok}\t$report->{$ngr}->{diff}\n";
 }
```