From ff05e5a1b5c8dfacdcd2563533a1eba259d7b572 Mon Sep 17 00:00:00 2001 From: bojar Date: Wed, 3 Feb 2010 16:37:08 +0000 Subject: list frequent mismatched tokenizations first git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2852 1f5c12ca-751b-0410-a591-d2e778427230 --- scripts/analysis/suspicious_tokenization.pl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'scripts/analysis') diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl index f7ca3c60d..29e32d271 100755 --- a/scripts/analysis/suspicious_tokenization.pl +++ b/scripts/analysis/suspicious_tokenization.pl @@ -49,10 +49,14 @@ foreach my $ngr (keys %$ngrams) { $report->{$ngr}->{"tok"} = $tokcnt; $report->{$ngr}->{"untok"} = $untokcnt; $report->{$ngr}->{"diff"} = abs($untokcnt-$tokcnt); + $report->{$ngr}->{"sum"} = $untokcnt+$tokcnt; } # Report -foreach my $ngr (sort {$report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}} +foreach my $ngr (sort { + $report->{$a}->{"diff"} <=> $report->{$b}->{"diff"} + || $report->{$b}->{"sum"} <=> $report->{$a}->{"sum"} + } keys %$report) { print "$ngr\t$report->{$ngr}->{untok}\t$report->{$ngr}->{tok}\t$report->{$ngr}->{diff}\n"; } -- cgit v1.2.3