diff options
Diffstat (limited to 'scripts/analysis/smtgui/newsmtgui.cgi')
-rwxr-xr-x | scripts/analysis/smtgui/newsmtgui.cgi | 935 |
1 files changed, 0 insertions, 935 deletions
diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi deleted file mode 100755 index 0e969a788..000000000 --- a/scripts/analysis/smtgui/newsmtgui.cgi +++ /dev/null @@ -1,935 +0,0 @@ -#!/usr/bin/perl -w - -use strict; - -use CGI; -use Corpus; #Evan's code -use Error qw(:try); - -#files with extensions other than these are interpreted as system translations; see the file 'file-descriptions', if it exists, for the comments that go with them -my %FILETYPE = ('e' => 'Reference Translation', - 'f' => 'Foreign Original', - 'ref.sgm' => 'Reference Translations', - 'e.sgm' => 'Reference Translations', - 'src.sgm' => 'Foreign Originals', - 'f.sgm' => 'Foreign Originals'); -my %DONTSCORE = ('f' => 1, 'f.sgm' => 1, 'src.sgm' => 1, - 'e' => 1, 'e.sgm' => 1, 'ref.sgm' => 1); -my @SHOW = ('f', 'e', 'comm'); -my %SHOW_COLOR = ('f' => "BLUE", - 'e' => "GREEN"); -my $FOREIGN = 'f'; - -#FILEDESC: textual descriptions associated with specific filenames; to be displayed on the single-corpus view -my %FILEDESC = (); &load_descriptions(); -my %factorIndices = loadFactorIndices('file-factors'); -my %MEMORY; &load_memory(); -my (@mBLEU,@NIST); -@mBLEU=`cat mbleu-memory.dat` if -e "mbleu-memory.dat"; chop(@mBLEU); -@NIST = `cat nist-memory.dat` if -e "nist-memory.dat"; chop(@NIST); -my %in; &ReadParse(); #parse arguments - -if (scalar(@ARGV) > 0 && $ARGV[0] eq 'bleu') { - $in{CORPUS} = $ARGV[1]; - $in{ACTION} = "VIEW_CORPUS"; -} - -my %MULTI_REF; -if ($in{CORPUS} && -e "$in{CORPUS}.ref.sgm") { - my $sysid; - open(REF,"$in{CORPUS}.ref.sgm"); - while(<REF>) { - $sysid = $1 if /<DOC.+sysid=\"([^\"]+)\"/; - if (/<seg[^>]*> *(\S.+\S) *<\/seg>/) { - push @{$MULTI_REF{$sysid}}, $1; - } - } - close(REF); -} - -if ($in{ACTION} eq '') { &show_corpora(); } -elsif ($in{ACTION} eq 'VIEW_CORPUS') { &view_corpus(); } -elsif ($in{ACTION} eq 'SCORE_FILE') { &score_file(); } -elsif ($in{ACTION} eq 'RESCORE_FILE') { &score_file(); } -elsif ($in{ACTION} eq 'COMPARE') { &compare(); } -else { &htmlhead("Unknown Action $in{ACTION}"); } -print "</BODY></HTML>\n"; - -###### SHOW CORPORA IN EVALUATION DIRECTORY - -sub show_corpora { - my %CORPUS = (); - - # find corpora in evaluation directory: see the factor-index file, which was already read in - foreach my $corpusName (keys %factorIndices) - { - $CORPUS{$corpusName} = 1; - } - - # list corpora - &htmlhead("All Corpora"); - print "<UL>\n"; - foreach (sort (keys %CORPUS)) { - print "<LI><A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($_)."\">Corpus $_</A>\n"; - } - print "</UL>\n"; -} - -###### SHOW INFORMATION FOR ONE CORPUS - -sub view_corpus { - my @TABLE; - &htmlhead("View Corpus $in{CORPUS}"); - - # find corpora in evaluation directory - my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-indices' => $factorIndices{$in{CORPUS}}); - - my ($sentence_count, $lineInfo); - if(-e "$in{CORPUS}.f") - { - $lineInfo = `wc -l $in{CORPUS}.f`; - $lineInfo =~ /^\s*(\d+)\s+/; - $sentence_count = 0 + $1; - } - else - { - $lineInfo = `wc -l $in{CORPUS}.e`; - $lineInfo =~ /^\s*(\d+)\s+/; - $sentence_count = 0 + $1; - } - - print "Corpus '$in{CORPUS}' consists of $sentence_count sentences\n"; - print "(<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&mBLEU=1>with mBLEU</A>)" if ((!defined($in{mBLEU})) && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f"); - print "<P>\n"; - print "<FORM ACTION=''>\n"; - print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=COMPARE>\n"; - print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; - print "<TABLE BORDER=1 CELLSPACING=0><TR> -<TD>Filename (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS}).">sort</A>)</TD> -<TD>Date (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=TIME>sort</A>)</TD>"; - if (-e "$in{CORPUS}.e") { - print "<TD>IBM BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=IBM>sort</A>)</TD>"; - } - if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm") { - print "<TD>NIST (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=NIST>sort</A>)</TD>"; - if (! -e "$in{CORPUS}.e") { - print "<TD>BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=BLEU>sort</A>)</TD>"; - } - } - if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f") { - print "<TD>mBLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=mBLEU>sort</A>)</TD>"; - } - print "<TD>Unknown Words</TD>"; #can't sort on; only applies to the input - print "<TD>Noun & adj WER-PWER</TD>"; #can't sort on; only applies to sysoutputs - print "<TD>Surface vs. lemma PWER</TD>"; #can't sort on; only applies to sysoutputs - print "<TD>Score (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=SCORE>sort</A>)</TD><TD>Actions</TD></TR>"; - - open(DIR,"ls $in{CORPUS}.*|"); - while(<DIR>) { - my $sort = ""; - chop; - my $sgm = 0; - if (/.sgm$/) - { - `grep '<seg' $_ | wc -l` =~ /^\s*(\d+)\s+/; - next unless $1 == $sentence_count; - $sgm = 1; - } - else - { - `wc -l $_` =~ /^\s*(\d+)\s+/; - next unless $1 == $sentence_count; - } - /^$in{CORPUS}.([^\/]+)$/; - my $file = $1; - # checkbox for compare - my $row = "<TR><TD><INPUT TYPE=CHECKBOX NAME=FILE_$file VALUE=1>"; - # README - if (-e "$in{CORPUS}.$file.README") { - my $readme = `cat $in{CORPUS}.$file.README`; - $readme =~ s/([\"\'])/\\\"/g; - $readme =~ s/[\n\r]/\\n/g; - $readme =~ s/\t/\\t/g; - $row .= "<A HREF='javascript:FieldInfo(\"$in{CORPUS}.$file\",\"$readme\")'>"; - } - # filename - $row .= "$in{CORPUS}.$file</A>"; - # description (hard-coded) - my @TRANSLATION_SENTENCE = `cat $in{CORPUS}.$file`; - chop(@TRANSLATION_SENTENCE); - - #count sentences that contain null words - my $null_count = 0; - foreach (@TRANSLATION_SENTENCE) - { - $null_count++ if /^NULL$/ || /^NONE$/; - } - if ($null_count > 0) { - $row .= "$null_count NULL "; - } - - $row .= " (".$FILETYPE{$file}.")" if defined($FILETYPE{$file}); - $row .= " (".$FILEDESC{$in{CORPUS}.".".$file}.")" if defined($FILEDESC{$in{CORPUS}.".".$file}); - $row .= " (".$FILEDESC{$file}.")" if defined($FILEDESC{$file}); - # filedate - my @STAT = stat("$in{CORPUS}.$file"); - my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($STAT[8]); #STAT[8] should be last modify time - my $time = sprintf("%04d-%02d-%02d %02d:%02d:%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec); - $row .= "</TD>\n<TD>".$time."</TD>\n"; - if (defined($in{SORT}) && $in{SORT} eq 'TIME') { $sort = $time; } - # IBM BLEU score - my $no_bleu =0; - if (!$sgm && -e "$in{CORPUS}.e") { - $row .= "<TD>"; - if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e" && $file !~ /^pt/) { - my ($score,$p1,$p2,$p3,$p4,$bp) = $corpus->calcBLEU($file, 'surf'); - $row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f", $score, $p1, $p2, $p3, $p4, $bp); - if (defined($in{SORT}) && $in{SORT} eq 'IBM') { $sort = $score; } - } - $row .= "</TD>\n"; - } - else { - $no_bleu=1; - } - # NIST score - if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm" - && !$DONTSCORE{$file}) { - $row .= "<TD>"; - print "$DONTSCORE{$file}+"; - my ($nist,$nist_bleu); - if ($file =~ /sgm$/) { - ($nist,$nist_bleu) = &get_nist_score("$in{CORPUS}.ref.sgm","$in{CORPUS}.src.sgm","$in{CORPUS}.$file"); - $row .= sprintf("<B>%.04f</B>",$nist); - if ($in{SORT} eq 'NIST') { $sort = $nist; } - } - $row .= "</TD>\n"; - if ($no_bleu) { - $row .= "<TD>"; - if ($file =~ /sgm$/) { - $row .= sprintf("<B>%.04f</B>",$nist_bleu); - if ($in{SORT} eq 'BLEU') { $sort = $nist_bleu; } - } - $row .= "</TD>\n"; - } - } - # multi-bleu - if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e") { - $row .= "<TD>"; - if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e") { - my ($score,$p1,$p2,$p3,$p4,$bp) = &get_multi_bleu_score("$in{CORPUS}.f","$in{CORPUS}.e","$in{CORPUS}.$file"); - $row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f",$score,$p1,$p2,$p3,$p4,$bp); - if ($in{SORT} eq 'mBLEU') { $sort = $score; } - } - $row .= "</TD>\n"; - } - - # misc stats - $row .= "<TD align=\"center\">"; - if($file eq 'f') #input - { - try - { - my ($unknownCount, $totalCount) = calc_unknown_words($corpus, 'surf'); - $row .= sprintf("%.4lf (%d / %d)", $unknownCount / $totalCount, $unknownCount, $totalCount); - } - catch Error::Simple with {$row .= "[system error]";}; - } - $row .= "</TD>\n<TD align=\"center\">"; - my ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER); - if($file ne 'e' && $file ne 'f' && $file !~ /^pt/) #system output - { - try - { - ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER) = calc_misc_stats($corpus, $file); - $row .= "WER = $nnAdjWER<br>PWER = $nnAdjPWER<br><b>diff = " . ($nnAdjWER - $nnAdjPWER) . "</b>"; - } - catch Error::Simple with {$row .= "[system error]";}; - } - $row .= "</TD>\n<TD align=\"center\">"; - if($file ne 'e' && $file ne 'f' && $file !~ /^pt/) #system output - { - if($surfPWER == -1) - { - $row .= "[system error]"; - } - else - { - my ($lemmaBLEU, $p1, $p2, $p3, $p4, $brevity) = $corpus->calcBLEU($file, 'lemma'); - $row .= sprintf("surface = %d<br>lemma = %d<br><b>lemma BLEU = %.04f</b> %.01f/%.01f/%.01f/%.01f *%.03f", - $surfPWER, $lemmaPWER, $lemmaBLEU, $p1, $p2, $p3, $p4, $brevity); - } - } - $row .= "</TD>\n"; - - # correct sentence score - my($correct,$wrong,$unknown); - $row .= "<TD>"; - if (!defined($DONTSCORE{$file}) && (scalar keys %MEMORY)) { - my ($correct,$just_syn,$just_sem,$wrong,$unknown) = - &get_score_from_memory("$in{CORPUS}.$FOREIGN", - "$in{CORPUS}.$file"); - $row .= "<B><FONT COLOR=GREEN>$correct</FONT></B>"; - $row .= "/<FONT COLOR=ORANGE>$just_syn</FONT>"; - $row .= "/<FONT COLOR=ORANGE>$just_sem</FONT>"; - $row .= "/<FONT COLOR=RED>$wrong</FONT> ($unknown)</TD>\n"; - if ($in{SORT} eq 'SCORE') { - $sort = sprintf("%03d %04d",$correct,$just_syn+$just_sem); - } - } - else - { - $row .= "</TD>\n"; - } - - # score / review links - $row .= "<TD>"; - $row .= "<A HREF=\"?ACTION=SCORE_FILE&VIEW=1&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">view</A>" if (!defined($DONTSCORE{$file})); - $row .= " <A HREF=\"?ACTION=SCORE_FILE&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">score</A>" if (!defined($DONTSCORE{$file}) && ($unknown || scalar keys %MEMORY == 0)); - $row .= " <A HREF=\"?ACTION=RESCORE_FILE&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">review</A>" if (!defined($DONTSCORE{$file}) && scalar keys %MEMORY); - $row .= "</TD></TR>\n"; - push @TABLE, "<!-- $sort -->\n$row"; - } - close(DIR); - foreach (reverse sort @TABLE) { print $_; } - print "</TABLE>\n"; - print "<INPUT TYPE=SUBMIT VALUE=\"Compare\">\n"; - print "<INPUT TYPE=CHECKBOX NAME=SURFACE VALUE=1 CHECKED> Compare all different sentences (instead of just differently <I>evaluated</I> sentences) <INPUT TYPE=CHECKBOX NAME=WITH_EVAL VALUE=1 CHECKED> with evaluation</FORM><P>\n"; - print "<P>The score is to be read as: <FONT COLOR=GREEN>correct</FONT>/<FONT COLOR=ORANGE>just-syn-correct</FONT>/<FONT COLOR=ORANGE>just-sem-correct</FONT>/<FONT COLOR=RED>wrong</FONT> (unscored)\n"; - print "<BR>IBM BLEU is to be read as: <B>metric</B> unigram/bigram/trigram/quadgram *brevity-penalty\n"; - print "<P><A HREF=\"newsmtgui.cgi?action=\">All corpora</A>\n"; -} - -###### SCORE TRANSLATIONS - -sub score_file { - if ($in{VIEW}) { - &htmlhead("View Translations"); - } - else { - &htmlhead("Score Translations"); - } - print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n"; - print "<FORM ACTION=\"\" METHOD=POST>\n"; - print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n"; - print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; - print "<INPUT TYPE=HIDDEN NAME=FILE VALUE=\"$in{FILE}\">\n"; - - # get sentences - my @SENTENCES; - if ($in{FILE} =~ /.sgm$/) { - @SENTENCES = `grep '<seg' $in{CORPUS}.$in{FILE}`; - for(my $i=0;$i<$#SENTENCES;$i++) { - $SENTENCES[$i] =~ s/^<seg[^>]+> *(\S.+\S) *<\/seg> *$/$1/; - } - } - else { - @SENTENCES = `cat $in{CORPUS}.$in{FILE}`; chop(@SENTENCES); - } - - my %REFERENCE; - foreach (@SHOW) { - if (-e "$in{CORPUS}.$_") { - @{$REFERENCE{$_}} = `cat $in{CORPUS}.$_`; chop(@{$REFERENCE{$_}}); - } - } - - # update memory - foreach (keys %in) { - next unless /^SYN_SCORE_(\d+)$/; - next unless $in{"SEM_SCORE_$1"}; - &store_in_memory($REFERENCE{$FOREIGN}[$1], - $SENTENCES[$1], - "syn_".$in{"SYN_SCORE_$1"}." sem_".$in{"SEM_SCORE_$1"}); - } - - # display sentences - for(my $i=0;$i<=$#SENTENCES;$i++) { - my $evaluation = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES[$i]); - next if ($in{ACTION} eq 'SCORE_FILE' && - ! $in{VIEW} && - $evaluation ne '' && $evaluation ne 'wrong'); - print "<P>Sentence ".($i+1).":<BR>\n"; - # color coding - &color_highlight_ngrams($i,&nist_normalize_text($SENTENCES[$i]),$REFERENCE{"e"}[$i]); - if (%MULTI_REF) { - foreach my $sysid (keys %MULTI_REF) { - print "<FONT COLOR=GREEN>".$MULTI_REF{$sysid}[$i]."</FONT> (Reference $sysid)<BR>\n"; - } - } - - # all sentences - print "$SENTENCES[$i] (System output)<BR>\n"; - foreach my $ref (@SHOW) { - if (-e "$in{CORPUS}.$ref") { - print "<FONT COLOR=$SHOW_COLOR{$ref}>".$REFERENCE{$ref}[$i]."</FONT> (".$FILETYPE{$ref}.")<BR>\n" if $REFERENCE{$ref}[$i]; - } - } - if (! $in{VIEW}) { - print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=correct"; - print " CHECKED" if ($evaluation =~ /syn_correct/); - print "> perfect English\n"; - print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=wrong"; - print " CHECKED" if ($evaluation =~ /syn_wrong/); - print "> imperfect English<BR>\n"; - print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=correct"; - print " CHECKED" if ($evaluation =~ /sem_correct/); - print "> correct meaning\n"; - print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=wrong"; - print " CHECKED" if ($evaluation =~ /sem_wrong/); - print "> incorrect meaning\n"; - } - } - if (! $in{VIEW}) { - print "<P><INPUT TYPE=SUBMIT VALUE=\"Add evaluation\">\n"; - print "</FORM>\n"; - } -} - -sub color_highlight_ngrams { - my($i,$sentence,$single_reference) = @_; - my @REF = (); - my %NGRAM = (); - if (%MULTI_REF) { - foreach my $sysid (keys %MULTI_REF) { - push @REF,&nist_normalize_text($MULTI_REF{$sysid}[$i]); - } - } - elsif ($single_reference) { - @REF = ($single_reference); - } - if (@REF) { - foreach my $ref (@REF) { - my @WORD = split(/\s+/,$ref); - for(my $n=1;$n<=4;$n++) { - for(my $w=0;$w<=$#WORD-($n-1);$w++) { - my $ngram = "$n: "; - for(my $j=0;$j<$n;$j++) { - $ngram .= $WORD[$w+$j]." "; - } - $NGRAM{$ngram}++; - } - } - } - $sentence =~ s/^\s+//; - $sentence =~ s/\s+/ /; - $sentence =~ s/\s+$//; - my @WORD = split(/\s+/,$sentence); - my @CORRECT; - for(my $w=0;$w<=$#WORD;$w++) { - $CORRECT[$w] = 0; - } - for(my $n=1;$n<=4;$n++) { - for(my $w=0;$w<=$#WORD-($n-1);$w++) { - my $ngram = "$n: "; - for(my $j=0;$j<$n;$j++) { - $ngram .= $WORD[$w+$j]." "; - } - next unless defined($NGRAM{$ngram}) && $NGRAM{$ngram}>0; - $NGRAM{$ngram}--; - for(my $j=0;$j<$n;$j++) { - $CORRECT[$w+$j] = $n; - } - } - } - my @COLOR; - $COLOR[0] = "#FF0000"; - $COLOR[1] = "#C000C0"; - $COLOR[2] = "#0000FF"; - $COLOR[3] = "#00C0C0"; - $COLOR[4] = "#00C000"; - for(my $w=0;$w<=$#WORD;$w++) { - print "<B><FONT COLOR=".$COLOR[$CORRECT[$w]].">$WORD[$w]<SUB>".$CORRECT[$w]."</SUB></FONT></B> "; - } - print "\n<BR>"; - } -} - -###### OTHER STATS - -#print (in some unspecified way) the offending exception of type Error::Simple -#arguments: the error object, a context string -#return: none -sub printError -{ - my ($err, $context) = @_; - warn "$context: " . $err->{'-text'} . " @ " . $err->{'-file'} . " (" .$err->{'-line'} . ")\n"; -} - -#compute number and percentage of unknown tokens for a given factor in foreign corpus -#arguments: corpus object ref, factor name -#return (unkwordCount, totalWordCount), or (-1, -1) if an error occurs -sub calc_unknown_words -{ - my ($corpus, $factorName) = @_; - try - { - my ($unknownCount, $totalCount) = $corpus->calcUnknownTokens($factorName); - return ($unknownCount, $totalCount); - } - catch Error::Simple with - { - my $err = shift; - printError($err, 'calc_unknown_words()'); - return (-1, -1); - }; -} - -#compute (if we have the necessary factors) info for: -#- diff btwn wer and pwer for NNs & ADJs -- if large, many reordering errors -#- diff btwn pwer for surface forms and pwer for lemmas -- if large, morphology errors -#arguments: corpus object, system name -#return (NN/ADJ (wer, pwer), surf pwer, lemma pwer), or (-1, -1, -1, -1) if an error occurs -sub calc_misc_stats -{ - my ($corpus, $sysname) = @_; - try - { - my ($nnAdjWER, $nnAdjPWER) = $corpus->calcNounAdjWER_PWERDiff($sysname); - my ($surfPWER, $lemmaPWER) = ($corpus->calcOverallPWER($sysname, 'surf'), $corpus->calcOverallPWER($sysname, 'lemma')); - return ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER); - } - catch Error::Simple with - { - my $err = shift; - printError($err, 'calc_misc_stats()'); - return (-1, -1, -1, -1); - }; -} - -###### NIST SCORE - -sub get_nist_score { - my($reference_file,$source_file,$translation_file) = @_; - my @STAT = stat($translation_file); - my $current_timestamp = $STAT[9]; - foreach (@NIST) { - my ($file,$time,$nist,$bleu) = split; - return ($nist,$bleu) - if ($file eq $translation_file && $current_timestamp == $time); - } - - my $nist_eval = `/home/pkoehn/statmt/bin/mteval-v10.pl -c -r $reference_file -s $source_file -t $translation_file`; - return (0,0) unless ($nist_eval =~ /NIST score = (\d+\.\d+) BLEU score = (\d+\.\d+)/i); - - open(NIST,">>nist-memory.dat"); - printf NIST "$translation_file $current_timestamp %f %f\n",$1,$2; - close(NIST); - return ($1,$2); -} - -sub nist_normalize_text { - my ($norm_text) = @_; - -# language-independent part: - $norm_text =~ s/<skipped>//g; # strip "skipped" tags - $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines - $norm_text =~ s/\n/ /g; # join lines - $norm_text =~ s/(\d)\s+(\d)/$1$2/g; #join digits - $norm_text =~ s/"/"/g; # convert SGML tag for quote to " - $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & - $norm_text =~ s/</</g; # convert SGML tag for less-than to > - $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < - -# language-dependent part (assuming Western languages): - $norm_text = " $norm_text "; -# $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case; - $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation - $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit - $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit - $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit - $norm_text =~ s/\s+/ /g; # one space only between words - $norm_text =~ s/^\s+//; # no leading space - $norm_text =~ s/\s+$//; # no trailing space - - return $norm_text; -} - -###### BLEU SCORE - -sub get_multi_bleu_score { - my($foreign_file,$reference_file,$translation_file) = @_; - my @STAT = stat($translation_file); - my $current_timestamp = $STAT[9]; - foreach (@mBLEU) { - my ($file,$time,$score,$g1,$g2,$g3,$g4,$bp) = split; - if ($file eq $translation_file && $current_timestamp == $time) { - return ($score,$g1*100,$g2*100,$g3*100,$g4*100,$bp); - } - } - - # load reference translation from reference file - my @REFERENCE_SENTENCE = `cat $reference_file`; chop(@REFERENCE_SENTENCE); - my @TRANSLATION_SENTENCE = `cat $translation_file`; chop(@TRANSLATION_SENTENCE); - my %REF; - my @FOREIGN_SENTENCE = `cat $foreign_file`; chop(@FOREIGN_SENTENCE); - for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) { - push @{$REF{$FOREIGN_SENTENCE[$i]}},$REFERENCE_SENTENCE[$i]; - } - # load reference translation from translation memory - foreach my $memory (keys %MEMORY) { - next if $MEMORY{$memory} ne 'syn_correct sem_correct'; - my ($foreign,$english) = split(/ .o0O0o. /,$memory); - next unless defined($REF{$foreign}); - push @{$REF{$foreign}},$english; - } - my(@CORRECT,@TOTAL,$length_translation,$length_reference); - # compute bleu - for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) { - my %REF_NGRAM = (); - my @WORD = split(/ /,$TRANSLATION_SENTENCE[$i]); - my $length_translation_this_sentence = scalar(@WORD); - my ($closest_diff,$closest_length) = (9999,9999); - foreach my $reference (@{$REF{$FOREIGN_SENTENCE[$i]}}) { - my @WORD = split(/ /,$reference); - my $length = scalar(@WORD); - if (abs($length_translation_this_sentence-$length) < $closest_diff) { - $closest_diff = abs($length_translation_this_sentence-$length); - $closest_length = $length; - } - for(my $n=1;$n<=4;$n++) { - my %REF_NGRAM_N = (); - for(my $start=0;$start<=$#WORD-($n-1);$start++) { - my $ngram = "$n"; - for(my $w=0;$w<$n;$w++) { - $ngram .= " ".$WORD[$start+$w]; - } - $REF_NGRAM_N{$ngram}++; - } - foreach my $ngram (keys %REF_NGRAM_N) { - if (!defined($REF_NGRAM{$ngram}) || - $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { - $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; - } - } - } - } - $length_translation += $length_translation_this_sentence; - $length_reference += $closest_length; - for(my $n=1;$n<=4;$n++) { - my %T_NGRAM = (); - for(my $start=0;$start<=$#WORD-($n-1);$start++) { - my $ngram = "$n"; - for(my $w=0;$w<$n;$w++) { - $ngram .= " ".$WORD[$start+$w]; - } - $T_NGRAM{$ngram}++; - } - foreach my $ngram (keys %T_NGRAM) { - my $n = 0+$ngram; -# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n"; - $TOTAL[$n] += $T_NGRAM{$ngram}; - if (defined($REF_NGRAM{$ngram})) { - if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { - $CORRECT[$n] += $T_NGRAM{$ngram}; -# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n"; - } - else { - $CORRECT[$n] += $REF_NGRAM{$ngram}; -# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n"; - } - } - } - } - } - my $brevity_penalty = 1; - if ($length_translation<$length_reference) { - $brevity_penalty = exp(1-$length_reference/$length_translation); - } - my $bleu = $brevity_penalty * exp((my_log( $CORRECT[1]/$TOTAL[1] ) + - my_log( $CORRECT[2]/$TOTAL[2] ) + - my_log( $CORRECT[3]/$TOTAL[3] ) + - my_log( $CORRECT[4]/$TOTAL[4] ) ) / 4); - - open(BLEU,">>mbleu-memory.dat"); - @STAT = stat($translation_file); - printf BLEU "$translation_file $STAT[9] %f %f %f %f %f %f\n",$bleu,$CORRECT[1]/$TOTAL[1],$CORRECT[2]/$TOTAL[2],$CORRECT[3]/$TOTAL[3],$CORRECT[4]/$TOTAL[4],$brevity_penalty; - close(BLEU); - - return ($bleu, - 100*$CORRECT[1]/$TOTAL[1], - 100*$CORRECT[2]/$TOTAL[2], - 100*$CORRECT[3]/$TOTAL[3], - 100*$CORRECT[4]/$TOTAL[4], - $brevity_penalty); -} - -sub my_log { - return -9999999999 unless $_[0]; - return log($_[0]); -} - - -###### SCORE TRANSLATIONS - -################################ IN PROGRESS ############################### -sub compare2 -{ - &htmlhead("Compare Translations"); - print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n"; - print "<FORM ACTION=\"\" METHOD=POST>\n"; - print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n"; - print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; - my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-indices' => \%factorIndices); - $corpus->writeComparisonPage(\*STDOUT, /^.*$/); - print "</FORM>\n"; -} - -sub compare { - &htmlhead("Compare Translations"); - print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n"; - print "<FORM ACTION=\"\" METHOD=POST>\n"; - print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n"; - print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; - - # get sentences - my %SENTENCES; - my $sentence_count; - foreach (keys %in) { - if (/^FILE_(.+)$/) { - my $file = $1; - print "<INPUT TYPE=HIDDEN NAME=\"$file\" VALUE=1>\n"; - my @SENTENCES; - if ($file =~ /.sgm$/) { - @{$SENTENCES{$file}} = `grep '<seg' $in{CORPUS}.$file`; - for(my $i=0;$i<$#{$SENTENCES{$file}};$i++) { - $SENTENCES{$file}[$i] =~ s/^<seg[^>]+> *(\S.+\S) *<\/seg> *$/$1/; - } - } - else { - @{$SENTENCES{$file}} = `cat $in{CORPUS}.$1`; - chop(@{$SENTENCES{$file}}); - } - - $sentence_count = scalar @{$SENTENCES{$file}}; - } - } - my %REFERENCE; - foreach (@SHOW) { - if (-e "$in{CORPUS}.$_") { - @{$REFERENCE{$_}} = `cat $in{CORPUS}.$_`; chop(@{$REFERENCE{$_}}); - } - } - - # update memory - foreach (keys %in) { - next unless /^SYN_SCORE_(.+)_(\d+)$/; - next unless $in{"SEM_SCORE_$1_$2"}; - &store_in_memory($REFERENCE{$FOREIGN}[$2], - $SENTENCES{$1}[$2], - "syn_".$in{"SYN_SCORE_$1_$2"}." sem_".$in{"SEM_SCORE_$1_$2"}); - } - - # display sentences - for(my $i=0;$i<$sentence_count;$i++) - { - my $evaluation = ""; - my $show = 0; - my $surface = ""; - foreach my $file (keys %SENTENCES) - { - if ($in{SURFACE}) { - $SENTENCES{$file}[$i] =~ s/ *$//; - $surface = $SENTENCES{$file}[$i] if ($surface eq ''); - $show = 1 if ($SENTENCES{$file}[$i] ne $surface); - } - else { - my $this_ev = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES{$file}[$i]); - $this_ev = "syn_wrong sem_wrong" unless $this_ev; - $evaluation = $this_ev if ($evaluation eq ''); - $show = 1 if ($evaluation ne $this_ev); - } - } - next unless $show; - print "<HR>Sentence ".($i+1).":<BR>\n"; - foreach my $ref (@SHOW) { - if (-e "$in{CORPUS}.$ref") { - print "<FONT COLOR=$SHOW_COLOR{$ref}>".$REFERENCE{$ref}[$i]."</FONT> (".$FILETYPE{$ref}.")<BR>\n"; - } - } - foreach my $file (keys %SENTENCES) { - print "<B>$SENTENCES{$file}[$i]</B> ($file)<BR>\n"; - &color_highlight_ngrams($i,&nist_normalize_text($SENTENCES{$file}[$i]),$REFERENCE{"e"}[$i]); - if (0 && $in{WITH_EVAL}) { - $evaluation = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES{$file}[$i]); - print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$file"."_$i VALUE=correct"; - print " CHECKED" if ($evaluation =~ /syn_correct/); - print "> perfect English\n"; - print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$file"."_$i VALUE=wrong"; - print " CHECKED" if ($evaluation =~ /syn_wrong/); - print "> imperfect English<BR>\n"; - print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$file"."_$i VALUE=correct"; - print " CHECKED" if ($evaluation =~ /sem_correct/); - print "> correct meaning\n"; - print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$file"."_$i VALUE=wrong"; - print " CHECKED" if ($evaluation =~ /sem_wrong/); - print "> incorrect meaning<BR>\n"; - } - } - } - print "<P><INPUT TYPE=SUBMIT VALUE=\"Add evaluation\">\n"; - print "</FORM>\n"; -} - -###### MEMORY SUBS - -sub load_memory { - open(MEMORY,"evaluation-memory.dat") or return; - while(<MEMORY>) { - chop; - my($foreign,$translation,$evaluation) = split(/ \.o0O0o\. /); - $evaluation = 'syn_correct sem_correct' if ($evaluation eq 'correct'); - $MEMORY{"$foreign .o0O0o. $translation"} = $evaluation; - } - close(MEMORY); -} - -sub get_score_from_memory { - my($foreign_file,$translation_file) = @_; - my $unknown=0; - my $correct=0; - my $just_syn=0; - my $just_sem=0; - my $wrong=0; - my @FOREIGN = `cat $foreign_file`; chop(@FOREIGN); - my @TRANSLATION = `cat $translation_file`; chop(@TRANSLATION); - for(my $i=0;$i<=$#FOREIGN;$i++) { - if (my $evaluation = &get_from_memory($FOREIGN[$i],$TRANSLATION[$i])) { - if ($evaluation eq 'syn_correct sem_correct') { $correct++ } - elsif ($evaluation eq 'syn_correct sem_wrong') { $just_syn++ } - elsif ($evaluation eq 'syn_wrong sem_correct') { $just_sem++ } - elsif ($evaluation eq 'syn_wrong sem_wrong') { $wrong++ } - else { $unknown++; } - } - else { $unknown++; } - } - return($correct,$just_syn,$just_sem,$wrong,$unknown); -} - -sub store_in_memory { - my($foreign,$translation,$evaluation) = @_; - &trim(\$translation); - return if $MEMORY{"$foreign .o0O0o. $translation"} eq $evaluation; - $MEMORY{"$foreign .o0O0o. $translation"} = $evaluation; - open(MEMORY,">>evaluation-memory.dat") or die "store_in_memory(): couldn't open 'evaluation-memory.dat' for append\n"; - print MEMORY "$foreign .o0O0o. $translation .o0O0o. $evaluation\n"; - close(MEMORY); -} - -sub get_from_memory { - my($foreign,$translation) = @_; - &trim(\$translation); - return $MEMORY{"$foreign .o0O0o. $translation"}; -} - -sub trim { - my($translation) = @_; - $$translation =~ s/ +/ /g; - $$translation =~ s/^ +//; - $$translation =~ s/ +$//; -# $$translation =~ s/ +[\.]$//; -} - -sub load_descriptions { - open(FD,"file-descriptions") or die "load_descriptions(): couldn't open 'file-descriptions' for read\n"; - while(<FD>) { - my($file,$description) = split(/\s+/,$_,2); - $FILEDESC{$file} = $description; - } - close(FD); -} - -#read config file giving names of corpi and fill factor-index map -#arguments: filename to read -#return: hash of corpus names to hashrefs of factor names to indices -sub loadFactorIndices -{ - my $filename = shift; - my %data = (); - open(INFILE, "<$filename") or die "loadFactorIndices(): couldn't open '$filename' for read\n"; - while(my $line = <INFILE>) - { - if($line =~ /^\#/) {next;} #skip comment lines - my @tokens = split(/\s+/, $line); - my $corpusName = shift @tokens; - $data{$corpusName} = {}; - for(my $i = 0; $i < scalar(@tokens); $i++) - { - $data{$corpusName}->{$tokens[$i]} = $i; - } - } - close(INFILE); - return %data; -} - -###### SUBS - -sub htmlhead { - print <<"___ENDHTML"; -Content-type: text/html - -<HTML><HEAD> -<TITLE>MTEval: $_[0]</TITLE> -<SCRIPT LANGUAGE="JavaScript"> - -<!-- hide from old browsers - -function FieldInfo(field,description) { - popup = window.open("","popDialog","height=500,width=600,scrollbars=yes,resizable=yes"); - popup.document.write("<HTML><HEAD><TITLE>"+field+"</TITLE></HEAD><BODY BGCOLOR=#FFFFCC><CENTER><B>"+field+"</B><HR SIZE=2 NOSHADE></CENTER><PRE>"+description+"</PRE><CENTER><FORM><INPUT TYPE='BUTTON' VALUE='Okay' onClick='self.close()'></FORM><CENTER></BODY></HTML>"); - popup.focus(); - popup.document.close(); -} - -<!-- done hiding --> - -</SCRIPT> -</HEAD> -<BODY BGCOLOR=white> -<H2>Evaluation Tool for Machine Translation<BR>$_[0]</H2> -___ENDHTML -} - - -############################# parts of cgi-lib.pl - - -sub ReadParse { - my ($i, $key, $val); - - # Read in text - my $in; - if (&MethGet) { - $in = $ENV{'QUERY_STRING'}; - } elsif (&MethPost) { - read(STDIN,$in,$ENV{'CONTENT_LENGTH'}); - } - - my @in = split(/[&;]/,$in); - - foreach $i (0 .. $#in) { - # Convert plus's to spaces - $in[$i] =~ s/\+/ /g; - - # Split into key and value. - ($key, $val) = split(/=/,$in[$i],2); # splits on the first =. - - # Convert %XX from hex numbers to alphanumeric - $key =~ s/%(..)/pack("c",hex($1))/ge; - $val =~ s/%(..)/pack("c",hex($1))/ge; - - # Associate key and value - $in{$key} .= "\0" if (defined($in{$key})); # \0 is the multiple separator - $in{$key} .= $val; - - } - - return scalar(@in); -} - -sub MethGet { - return ($ENV{'REQUEST_METHOD'} eq "GET"); -} - -sub MethPost { - return ($ENV{'REQUEST_METHOD'} eq "POST"); -} |