Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2010-02-03 19:37:08 +0300
committer: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2010-02-03 19:37:08 +0300
commit: ff05e5a1b5c8dfacdcd2563533a1eba259d7b572 (patch)
tree: b4213a72fac6b27abd816737a6379f6c1a15b8b2 /scripts/analysis
parent: 9b10946f108e00670e0a0e3f51242d10273d34a7 (diff)
list frequent mismatched tokenizations first
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2852 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x scripts/analysis/suspicious_tokenization.pl 6
1 file changed, 5 insertions, 1 deletion
diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl
index f7ca3c60d..29e32d271 100755
--- a/scripts/analysis/suspicious_tokenization.pl
+++ b/scripts/analysis/suspicious_tokenization.pl
@@ -49,10 +49,14 @@ foreach my $ngr (keys %$ngrams) {
$report->{$ngr}->{"tok"} = $tokcnt;
$report->{$ngr}->{"untok"} = $untokcnt;
$report->{$ngr}->{"diff"} = abs($untokcnt-$tokcnt);
+ $report->{$ngr}->{"sum"} = $untokcnt+$tokcnt;
}
# Report
-foreach my $ngr (sort {$report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}}
+foreach my $ngr (sort {
+ $report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}
+ || $report->{$b}->{"sum"} <=> $report->{$a}->{"sum"}
+ }
keys %$report) {
print "$ngr\t$report->{$ngr}->{untok}\t$report->{$ngr}->{tok}\t$report->{$ngr}->{diff}\n";
}