#!/usr/bin/perl
#
# nontranslated_words.pl srcfile hypothesisfile
#
# Counts the output (hypothesis) tokens that also occur in the source
# sentence on the same line number.  Such tokens typically passed through
# the decoder unchanged, mostly because they were unknown words.
#
# Both files are read as UTF-8, one sentence per line, and tokens are
# whitespace-separated.  Every occurrence of a copied token in the output
# is counted, even if the token occurs only once in the source sentence.
#
# Options:
#   --ignore-numbers   do not count copied tokens that look like numbers
#   --ignore-punct     do not count copied tokens made only of punctuation
#   --help             print the usage message
#
# Provenance (recovered from the patch this script was extracted from):
#   commit:     8f504a1d9ba80466dd5b24424a0a3b06ff4cfbc3
#   From:       bojar
#   Date:       Fri, 11 Aug 2006 21:26:24 +0000
#   Subject:    a handy script to count words that passed through the
#               decoder unchanged (mostly because they're unknown); can
#               exclude numbers and punctuation
#   git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@667 1f5c12ca-751b-0410-a591-d2e778427230
#   path:       scripts/analysis/nontranslated_words.pl (new file, mode 100755)
#   source:     cgit v1.2.3 patch view

use strict;
use warnings;
use Getopt::Long;

my $ignore_numbers = 0;
my $ignore_punct   = 0;
my $usage          = 0;

GetOptions(
    "help"           => \$usage,
    "ignore-numbers" => \$ignore_numbers,
    "ignore-punct"   => \$ignore_punct,
) or exit 1;

my $src = shift;
my $tgt = shift;

if ($usage || !defined $src || !defined $tgt) {
    print STDERR "nontranslated_words.pl srcfile hypothesisfile
...counts the number of words that are equal in src and hyp. These are
typically unknown words.
Options:
  --ignore-numbers ... numbers usually do not get translated, but do
                       not count them (it is not an error)
  --ignore-punct   ... same for punct, do not include it in the count
";
    exit 1;
}

# Three-argument open with lexical handles: the file name can never be
# mistaken for an open mode, and the handles are closed when they go out
# of scope even if we die early.
open my $src_fh, '<:encoding(UTF-8)', $src or die "Can't read $src: $!";
open my $tgt_fh, '<:encoding(UTF-8)', $tgt or die "Can't read $tgt: $!";

# A token counts as a number if it is an optionally negative run of digits
# with at most one "." or "," fraction part.  At least one digit is
# required, so a lone "-" is left to the punctuation test.
my $number_re = qr/^-?(?:[0-9]+(?:[.,][0-9]+)?|[.,][0-9]+)$/;
my $punct_re  = qr/^[[:punct:]]+$/;

# Split a line into whitespace-separated tokens, ignoring leading and
# trailing whitespace (including the line terminator).
sub _tokens {
    my ($line) = @_;
    $line =~ s/^\s+|\s+$//g;
    return split /\s+/, $line;
}

my $nr         = 0;    # sentence pairs read
my $outtoks    = 0;    # tokens in the hypothesis file
my $intoks     = 0;    # tokens in the source file
my $copiedtoks = 0;    # hypothesis tokens also present in their source sentence

while (defined(my $src_line = <$src_fh>)) {
    $nr++;

    my @src_toks = _tokens($src_line);
    my %in_source = map { ($_ => 1) } @src_toks;
    $intoks += scalar @src_toks;

    my $tgt_line = <$tgt_fh>;
    die "$tgt too short!" if !defined $tgt_line;

    foreach my $outtok (_tokens($tgt_line)) {
        $outtoks++;
        # this word did not appear in input, we generated it
        next if !exists $in_source{$outtok};
        next if $ignore_numbers && $outtok =~ $number_re;
        next if $ignore_punct   && $outtok =~ $punct_re;
        $copiedtoks++;
    }
}

# The files are expected to be line-aligned; a longer hypothesis file most
# likely means the wrong pair of files was given, so say so.
if (defined(my $extra = <$tgt_fh>)) {
    warn "$tgt has more lines than $src; extra lines ignored\n";
}

close $src_fh;
close $tgt_fh;

# Avoid dividing by zero when the hypothesis contains no tokens at all.
my $copied_pct = $outtoks ? $copiedtoks / $outtoks * 100 : 0;

print "Sentences:\t$nr
Source tokens:\t$intoks
Output tokens:\t$outtoks
Output tokens appearing also in input sent:\t$copiedtoks\t"
    . sprintf("%.2f %%", $copied_pct)
    . "\t" . ($ignore_punct   ? "ignoring" : "including") . " punctuation"
    . "\t" . ($ignore_numbers ? "ignoring" : "including") . " numbers"
    . "\n";