#!/usr/bin/perl -w use strict; use utf8; use Encode; binmode STDOUT, ":utf8"; binmode STDERR, ":utf8"; ################################# # History: # # version 12 # * Text normalization changes: # * convert entity references (only the entities declared in the DTD) # * now uses unicode categories # * tokenize punctuation unless followed AND preceded by digits # * tokenize symbols # * UTF-8 handling: # * files are now read using utf8 mode # * Added the '-e' command-line option to enclose non-ASCII characters between spaces # # version 11b -- text normalization modified: # * take out the join digit line because it joins digits # when it shouldn't have # $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits # # version 11a -- corrected output of individual n-gram precision values # # version 11 -- bug fixes: # * make filehandle operate in binary mode to prevent Perl from operating # (by default in Red Hat 9) in UTF-8 # * fix failure on joining digits # version 10 -- updated output to include more details of n-gram scoring. # Defaults to generate both NIST and BLEU scores. Use -b for BLEU # only, use -n for NIST only # # version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4 # being the max, regardless what was entered on the command line.) # # version 09c -- bug fix (During the calculation of ngram information, # each ngram was being counted only once for each segment. This has # been fixed so that each ngram is counted correctly in each segment.) # # version 09b -- text normalization modified: # * option flag added to preserve upper case # * non-ASCII characters left in place. # # version 09a -- text normalization modified: # * " and & converted to "" and &, respectively # * non-ASCII characters kept together (bug fix) # # version 09 -- modified to accommodate sgml tag and attribute # names revised to conform to default SGML conventions. # # version 08 -- modifies the NIST metric in accordance with the # findings on the 2001 Chinese-English dry run corpus. Also # incorporates the BLEU metric as an option and supports the # output of ngram detail. # # version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI # Keep strings of non-ASCII characters together as one word # (rather than splitting them into one-character words). # Change length penalty so that translations that are longer than # the average reference translation are not penalized. # # version 06 # Prevent divide-by-zero when a segment has no evaluation N-grams. # Correct segment index for level 3 debug output. # # version 05 # improve diagnostic error messages # # version 04 # tag segments # # version 03 # add detailed output option (intermediate document and segment scores) # # version 02 # accommodation of modified sgml tags and attributes # # version 01 # same as bleu version 15, but modified to provide formal score output. # # original IBM version # Author: Kishore Papineni # Date: 06/10/2001 ################################# ###### # Intro my ($date, $time) = date_time_stamp(); print "MT evaluation scorer began on $date at $time\n"; print "command line: ", $0, " ", join(" ", @ARGV), "\n"; my $usage = "\n\nUsage: $0 [-h] -r -s -t \n\n". "Description: This Perl script evaluates MT system performance.\n". "\n". "Required arguments:\n". " -r is a file containing the reference translations for\n". " the documents to be evaluated.\n". " -s is a file containing the source documents for which\n". " translations are to be evaluated\n". " -t is a file containing the translations to be evaluated\n". "\n". "Optional arguments:\n". " -c preserves upper-case alphabetic characters\n". " -b generate BLEU scores only\n". " -n generate NIST scores only\n". " -d detailed output flag used in conjunction with \"-b\" or \"-n\" flags:\n". " 0 (default) for system-level score only\n". " 1 to include document-level scores\n". " 2 to include segment-level scores\n". " 3 to include ngram-level scores\n". " -e enclose non-ASCII characters between spaces\n". " -h prints this help message to STDOUT\n". "\n"; use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e); use Getopt::Std; getopts ('r:s:t:d:hbncx:e'); die $usage if defined($opt_h); die "Error in command line: ref_file not defined$usage" unless defined $opt_r; die "Error in command line: src_file not defined$usage" unless defined $opt_s; die "Error in command line: tst_file not defined$usage" unless defined $opt_t; my $max_Ngram = 9; my $detail = defined $opt_d ? $opt_d : 0; my $preserve_case = defined $opt_c ? 1 : 0; my $split_non_ASCII = defined $opt_e ? 1 : 0; my $METHOD = "BOTH"; if (defined $opt_b) { $METHOD = "BLEU"; } if (defined $opt_n) { $METHOD = "NIST"; } my $method; my ($ref_file) = $opt_r; my ($src_file) = $opt_s; my ($tst_file) = $opt_t; ###### # Global variables my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters my (%tst_data, %ref_data); # the data -- with structure: {system}{document}[segments] my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets my %eval_docs; # document information for the evaluation data set my %ngram_info; # the information obtained from (the last word in) the ngram ###### # Get source document ID's ($src_id) = get_source_info ($src_file); ###### # Get reference translations ($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file); compute_ngram_info (); ###### # Get translations to evaluate ($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file); ###### # Check data for completeness and correctness check_MT_data (); ###### # my %NISTmt = (); my %BLEUmt = (); ###### # Evaluate print " Evaluation of $src_lang-to-$tgt_lang translation using:\n"; my $cum_seg = 0; foreach my $doc (sort keys %eval_docs) { $cum_seg += @{$eval_docs{$doc}{SEGS}}; } print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n"; print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n"; print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n"; foreach my $sys (sort @tst_sys) { for (my $n=1; $n<=$max_Ngram; $n++) { $NISTmt{$n}{$sys}{cum} = 0; $NISTmt{$n}{$sys}{ind} = 0; $BLEUmt{$n}{$sys}{cum} = 0; $BLEUmt{$n}{$sys}{ind} = 0; } if (($METHOD eq "BOTH") || ($METHOD eq "NIST")) { $method="NIST"; score_system ($sys, %NISTmt); } if (($METHOD eq "BOTH") || ($METHOD eq "BLEU")) { $method="BLEU"; score_system ($sys, %BLEUmt); } } ###### printout_report (); ($date, $time) = date_time_stamp(); print "MT evaluation scorer ended on $date at $time\n"; exit 0; ################################# sub get_source_info { my ($file) = @_; my ($name, $id, $src, $doc); my ($data, $tag, $span); #read data from file open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; binmode FILE, ":utf8"; $data .= $_ while ; close (FILE); #get source set info die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n" unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data); die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag); die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" ." with $name in previous input data ('$src_lang')\n\n" unless (not defined $src_lang or $src eq $src_lang); $src_lang = $src; #get doc info -- ID and # of segs $data = $span; while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) { die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag); die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n" if defined $eval_docs{$doc}; $span =~ s/[\s\n\r]+/ /g; # concatenate records my $jseg=0, my $seg_data = $span; while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) { ($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span); } die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $jseg == 0; } die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n" unless keys %eval_docs > 0; return $id; } ################################# sub get_MT_data { my ($docs, $set_tag, $file) = @_; my ($name, $id, $src, $tgt, $sys, $doc); my ($tag, $span, $data); #read data from file open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; binmode FILE, ":utf8"; $data .= $_ while ; close (FILE); #get tag info while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) { die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag); die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" ." with $name of source ('$src_lang')\n\n" unless $src eq $src_lang; die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag); die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n" ." with $name of the evaluation ('$tgt_lang')\n\n" unless (not defined $tgt_lang or $tgt eq $tgt_lang); $tgt_lang = $tgt; my $mtdata = $span; while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) { die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag); die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag); die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n" ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n" unless (not defined $docs->{$sys}{$doc}); $span =~ s/[\s\n\r]+/ /g; # concatenate records my $jseg=0, my $seg_data = $span; while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) { ($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span); } die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $jseg == 0; $docs->{$sys}{$doc}{FILE} = $file; } } return $id; } ################################# sub check_MT_data { @tst_sys = sort keys %tst_data; @ref_sys = sort keys %ref_data; #every evaluation document must be represented for every system and every reference foreach my $doc (sort keys %eval_docs) { my $nseg_source = @{$eval_docs{$doc}{SEGS}}; foreach my $sys (@tst_sys) { die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc}; my $nseg = @{$tst_data{$sys}{$doc}{SEGS}}; die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" ." document '$doc' for system '$sys' contains $nseg segments, while\n" ." the source document contains $nseg_source segments.\n\n" unless $nseg == $nseg_source; } foreach my $sys (@ref_sys) { die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc}; my $nseg = @{$ref_data{$sys}{$doc}{SEGS}}; die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" ." document '$doc' for system '$sys' contains $nseg segments, while\n" ." the source document contains $nseg_source segments.\n\n" unless $nseg == $nseg_source; } } } ################################# sub compute_ngram_info { my ($ref, $doc, $seg); my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram); my (%ngram_count, @tot_ngrams); foreach $ref (keys %ref_data) { foreach $doc (keys %{$ref_data{$ref}}) { foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) { @wrds = split /\s+/, $seg; $tot_wrds += @wrds; %ngrams = %{Words2Ngrams (@wrds)}; foreach $ngram (keys %ngrams) { $ngram_count{$ngram} += $ngrams{$ngram}; } } } } foreach $ngram (keys %ngram_count) { @wrds = split / /, $ngram; pop @wrds, $mgram = join " ", @wrds; $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2; if (defined $opt_x and $opt_x eq "ngram info") { @wrds = split / /, $ngram; printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram}, $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram; } } } ################################# sub score_system { my ($sys, $ref, $doc, %SCOREmt); ($sys, %SCOREmt) = @_; my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); $cum_ref_length = 0; for (my $j=1; $j<=$max_Ngram; $j++) { $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; } foreach $doc (sort keys %eval_docs) { ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc); #output document summary score if (($detail >= 1 ) && ($METHOD eq "NIST")) { my %DOCmt = (); printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; } if (($detail >= 1 ) && ($METHOD eq "BLEU")) { my %DOCmt = (); printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; } $cum_ref_length += $shortest_ref_length; for (my $j=1; $j<=$max_Ngram; $j++) { $cum_match[$j] += $match_cnt->[$j]; $cum_tst_cnt[$j] += $tst_cnt->[$j]; $cum_ref_cnt[$j] += $ref_cnt->[$j]; $cum_tst_info[$j] += $tst_info->[$j]; $cum_ref_info[$j] += $ref_info->[$j]; printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j], $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j] if (defined $opt_x and $opt_x eq "document info"); } } #x #output system summary score #x printf "$method score = %.4f for system \"$sys\"\n", #x $method eq "BLEU" ? bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) : #x nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt); if ($method eq "BLEU") { bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, %SCOREmt); } if ($method eq "NIST") { nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, %SCOREmt); } } ################################# sub score_document { my ($sys, $ref, $doc); ($sys, $doc) = @_; my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); $cum_ref_length = 0; for (my $j=1; $j<=$max_Ngram; $j++) { $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; } #score each segment for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) { my @ref_segments = (); foreach $ref (@ref_sys) { push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg]; printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg] if $detail >= 3; } printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg] if $detail >= 3; ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments); #output segment summary score #x printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n", #x $method eq "BLEU" ? bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) : #x nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info), #x $jseg+1, $tst_cnt->[1] #x if $detail >= 2; if (($detail >=2) && ($METHOD eq "BLEU")) { my %DOCmt = (); printf " $method score using 4-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n", bleu_score($shortest_ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt), $jseg+1, $tst_cnt->[1]; } if (($detail >=2) && ($METHOD eq "NIST")) { my %DOCmt = (); printf " $method score using 5-grams = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n", nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt), $jseg+1, $tst_cnt->[1]; } $cum_ref_length += $shortest_ref_length; for (my $j=1; $j<=$max_Ngram; $j++) { $cum_match[$j] += $match_cnt->[$j]; $cum_tst_cnt[$j] += $tst_cnt->[$j]; $cum_ref_cnt[$j] += $ref_cnt->[$j]; $cum_tst_info[$j] += $tst_info->[$j]; $cum_ref_info[$j] += $ref_info->[$j]; } } return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]); } ################################# sub score_segment { my ($tst_seg, @ref_segs) = @_; my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info); my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info); my ($ngram); my (@nwrds_ref); my $shortest_ref_length; for (my $j=1; $j<= $max_Ngram; $j++) { $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0; } # get the ngram counts for the test segment @tst_wrds = split /\s+/, $tst_seg; %tst_ngrams = %{Words2Ngrams (@tst_wrds)}; for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0; } # get the ngram counts for the reference segments foreach $ref_seg (@ref_segs) { @ref_wrds = split /\s+/, $ref_seg; %ref_ngrams = %{Words2Ngrams (@ref_wrds)}; foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences my @wrds = split / /, $ngram; $ref_info[@wrds] += $ngram_info{$ngram}; $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram}; } for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0; } $shortest_ref_length = scalar @ref_wrds # find the shortest reference segment if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length; } # accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams foreach $ngram (keys %tst_ngrams) { next unless defined $ref_ngrams_max{$ngram}; my @wrds = split / /, $ngram; $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram if $detail >= 3; } return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]); } ################################# sub bleu_score { my ($shortest_ref_length, $matching_ngrams, $tst_ngrams, $sys, %SCOREmt) = @_; my $score = 0; my $iscore = 0; my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]); print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n"; for (my $j=1; $j<=$max_Ngram; $j++) { if ($matching_ngrams->[$j] == 0) { $SCOREmt{$j}{$sys}{cum}=0; } else { # Cumulative N-Gram score $score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]); $SCOREmt{$j}{$sys}{cum} = exp($score/$j + $len_score); # Individual N-Gram score $iscore = log ($matching_ngrams->[$j]/$tst_ngrams->[$j]); $SCOREmt{$j}{$sys}{ind} = exp($iscore); } } return $SCOREmt{4}{$sys}{cum}; } ################################# sub nist_score { my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, %SCOREmt) = @_; my $score = 0; my $iscore = 0; for (my $n=1; $n<=$max_Ngram; $n++) { $score += $tst_info->[$n]/max($tst_ngrams->[$n],1); $SCOREmt{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1); $SCOREmt{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); } return $SCOREmt{5}{$sys}{cum}; } ################################# sub Words2Ngrams { #convert a string of words to an Ngram count hash my %count = (); for (; @_; shift) { my ($j, $ngram, $word); for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) { $ngram .= defined $ngram ? " $word" : $word; $count{$ngram}++; } } return {%count}; } ################################# sub NormalizeText { my ($norm_text) = @_; $norm_text =~ s///g; # strip "skipped" tags $norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines $norm_text =~ s/\p{Zl}/ /g; # join lines # replace entities $norm_text =~ s/"/\"/g; # quote to " $norm_text =~ s/&/&/g; # ampersand to & $norm_text =~ s/<//g; # greater-than to > $norm_text =~ s/'/\'/g; # apostrophe to ' $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII ); # punctuation: tokenize any punctuation unless followed AND preceded by a digit $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g; $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g; $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols $norm_text =~ s/\p{Z}+/ /g; # one space only between words $norm_text =~ s/^\p{Z}+//; # no leading space $norm_text =~ s/\p{Z}+$//; # no trailing space return $norm_text; } ################################# sub nist_length_penalty { my ($ratio) = @_; return 1 if $ratio >= 1; return 0 if $ratio <= 0; my $ratio_x = 1.5; my $score_x = 0.5; my $beta = -log($score_x)/log($ratio_x)/log($ratio_x); return exp (-$beta*log($ratio)*log($ratio)); } ################################# sub date_time_stamp { my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); my ($date, $time); $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; return ($date, $time); } ################################# sub extract_sgml_tag_and_span { my ($name, $data) = @_; ($data =~ m|<$name\s*([^>]*)>(.*?)(.*)|si) ? ($1, $2, $3) : (); } ################################# sub extract_sgml_tag_attribute { my ($name, $data) = @_; ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : (); } ################################# sub max { my ($max, $next); return unless defined ($max=pop); while (defined ($next=pop)) { $max = $next if $next > $max; } return $max; } ################################# sub min { my ($min, $next); return unless defined ($min=pop); while (defined ($next=pop)) { $min = $next if $next < $min; } return $min; } ################################# sub printout_report { if ( $METHOD eq "BOTH" ) { foreach my $sys (sort @tst_sys) { printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum}; } } elsif ($METHOD eq "NIST" ) { foreach my $sys (sort @tst_sys) { printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum}; } } elsif ($METHOD eq "BLEU" ) { foreach my $sys (sort @tst_sys) { printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum}; } } printf "\n# ------------------------------------------------------------------------\n\n"; printf "Individual N-gram scoring\n"; printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) { foreach my $sys (sort @tst_sys) { printf " NIST:"; for (my $i=1; $i<=$max_Ngram; $i++) { printf " %2.4f ",$NISTmt{$i}{$sys}{ind} } printf " \"$sys\"\n"; } printf "\n"; } if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) { foreach my $sys (sort @tst_sys) { printf " BLEU:"; for (my $i=1; $i<=$max_Ngram; $i++) { printf " %2.4f ",$BLEUmt{$i}{$sys}{ind} } printf " \"$sys\"\n"; } } printf "\n# ------------------------------------------------------------------------\n"; printf "Cumulative N-gram scoring\n"; printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) { foreach my $sys (sort @tst_sys) { printf " NIST:"; for (my $i=1; $i<=$max_Ngram; $i++) { printf " %2.4f ",$NISTmt{$i}{$sys}{cum} } printf " \"$sys\"\n"; } } printf "\n"; if (( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU")) { foreach my $sys (sort @tst_sys) { printf " BLEU:"; for (my $i=1; $i<=$max_Ngram; $i++) { printf " %2.4f ",$BLEUmt{$i}{$sys}{cum} } printf " \"$sys\"\n"; } } }