#!/usr/bin/env perl

# analysis.perl -- analyze decoder output against a reference: n-gram
# precision/recall, per-sentence BLEU annotation, phrase-table and corpus
# coverage, segmentation reports, and search-graph extraction.
# NOTE(review): this file was recovered from a garbled (line-collapsed,
# angle-bracket-stripped) copy; reconstructed passages are marked below.

use warnings;
use strict;
use Getopt::Long "GetOptions";

my $MAX_LENGTH = 4;   # maximum n-gram length considered (raised to 7 for coverage)

my ($system,$system_alignment,$segmentation,$reference,$dir,$input,$corpus,
    $ttable,@FACTORED_TTABLE,$score_options,$hierarchical,$output_corpus,
    $alignment,$biconcor,$input_factors,$input_factor_names,
    $output_factor_names,$precision_by_coverage,$precision_by_coverage_factor,
    $coverage_dir,$search_graph);
if (!&GetOptions('system=s' => \$system, # raw output from decoder
                 'system-alignment=s' => \$system_alignment, # word alignment of system output
                 'reference=s' => \$reference, # tokenized reference
                 'dir=s' => \$dir, # directory for storing results
                 'input-factors=i' => \$input_factors, # list of input factors
                 'input-factor-names=s' => \$input_factor_names,
                 'output-factor-names=s' => \$output_factor_names,
                 'precision-by-coverage' => \$precision_by_coverage, # added report for input words
                 'precision-by-coverage-factor=i' => \$precision_by_coverage_factor, # sub-reports
                 'input=s' => \$input, # tokenized input (as for decoder)
                 'segmentation=s' => \$segmentation, # system output with segmentation markup
                 'input-corpus=s' => \$corpus, # input side of parallel training corpus
                 'ttable=s' => \$ttable, # phrase translation table used for decoding
                 'factored-ttable=s' => \@FACTORED_TTABLE, # factored phrase translation table
                 'score-options=s' => \$score_options, # score options to detect p(e|f) score
                 'output-corpus=s' => \$output_corpus, # output side of parallel training corpus
                 'alignment-file=s' => \$alignment, # alignment of parallel corpus
                 'coverage=s' => \$coverage_dir, # already computed coverage, stored in this dir
                 'biconcor=s' => \$biconcor, # binary for bilingual concordancer
                 'search-graph=s' => \$search_graph, # visualization of search graph
                 'hierarchical' => \$hierarchical) || # hierarchical model?
    !defined($dir)) {
    die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-score-options SETTINGS] [-segmentation FILE] [-output-corpus FILE] [-alignment-file FILE] [-biconcor BIN]");
}

`mkdir -p $dir`;

# factor names
if (defined($input_factor_names) && defined($output_factor_names)) {
    open(FACTOR,">$dir/factor-names") or die "Cannot open: $!";
    print FACTOR $input_factor_names."\n";
    print FACTOR $output_factor_names."\n";
    close(FACTOR);
}

# compare system output against reference translation
my(@SYSTEM,@REFERENCE);
my (%PRECISION_CORRECT,%PRECISION_TOTAL,
    %RECALL_CORRECT,%RECALL_TOTAL);
if (defined($system) || defined($reference)) {
    die("you need to you specify both system and reference, not just either")
        unless defined($system) && defined($reference);
    die("can't open system file $system") if ! -e $system;
    @SYSTEM = `cat $system`;
    chop(@SYSTEM);

    # multiple references are stored as $reference.ref0, .ref1, ...
    if (! -e $reference && -e $reference.".ref0") {
        for(my $i=0;-e $reference.".ref".$i;$i++) {
            my @REF = `cat $reference.ref$i`;
            chop(@REF);
            # NOTE(review): inner loop reconstructed from garbled source --
            # collect the i-th reference for each sentence
            for(my $j=0;$j<scalar(@REF);$j++) {
                push @{$REFERENCE[$j]}, $REF[$j];
            }
        }
    }
    else {
        @REFERENCE = `cat $reference`;
        chop(@REFERENCE);
    }

    # NOTE(review): driver loop reconstructed from garbled source --
    # tally n-gram precision (system vs. ref) and recall (ref vs. system)
    for(my $i=0;$i<scalar(@SYSTEM);$i++) {
        &add_match($SYSTEM[$i],$REFERENCE[$i],
                   \%PRECISION_CORRECT,\%PRECISION_TOTAL);
        &add_match($REFERENCE[$i],$SYSTEM[$i],
                   \%RECALL_CORRECT,\%RECALL_TOTAL);
    }

    open(SUMMARY,">$dir/summary") or die "Cannot open: $!";
    &best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision");
    &best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall");
    &bleu_annotation();
    close(SUMMARY);
}

# segmentation
if (defined($segmentation)) {
    if (defined($hierarchical)) {
        &hierarchical_segmentation();
    }
    else {
        &segmentation();
    }
}

# coverage analysis
my (%INPUT_PHRASE,%CORPUS_COVERED,%TTABLE_COVERED,%TTABLE_ENTROPY);
if (!defined($coverage_dir) && (defined($ttable) || defined($corpus))) {
    if (!defined($input)) {
        die("ERROR: when specifying either ttable or input-corpus, please also specify input\n");
    }
    $MAX_LENGTH = 7;
    &input_phrases();
    &ttable_coverage("0",$ttable) if defined($ttable);
    &corpus_coverage() if defined($corpus);
    &input_annotation();

    # corpus coverage for non-surface factors
    if (defined($input_factors)) {
        for(my $factor=1;$factor<$input_factors;$factor++) {
&input_phrases($factor); &corpus_coverage($factor); } } # factored ttable coverage foreach my $factored_ttable (@FACTORED_TTABLE) { die("factored ttable must be specified as factor:file -- $ttable") unless $factored_ttable =~ /^([\d,]+)\:(.+)/; # factor:ttable my ($factor,$file) = ($1,$2); next if defined($ttable) && $file eq $ttable; # no need to do this twice &input_phrases($factor); &ttable_coverage($factor,$file); } } if (defined($precision_by_coverage)) { &precision_by_coverage("ttable"); &precision_by_coverage("corpus"); } # bilingual concordance -- not used by experiment.perl if (defined($corpus) && defined($output_corpus) && defined($alignment) && defined($biconcor)) { `$biconcor -s $dir/biconcor -c $corpus -t $output_corpus -a $alignment`; } # process search graph for visualization if (defined($search_graph)) { &process_search_graph($search_graph); } sub best_matches { my ($CORRECT,$TOTAL,$out) = @_; my $type = ($out =~ /precision/) ? "precision" : "recall"; for(my $length=1;$length<=$MAX_LENGTH;$length++) { my ($total,$correct) = (0,0); open(OUT,">$out.$length") or die "Cannot open: $!"; foreach my $ngram (keys %{$$TOTAL{$length}}) { printf OUT "%d\t%d\t%s\n", $$TOTAL{$length}{$ngram}, $$CORRECT{$length}{$ngram}, $ngram; $total += $$TOTAL{$length}{$ngram}; $correct += $$CORRECT{$length}{$ngram}; } close(OUT); print SUMMARY "$type-$length-total: $total\n"; print SUMMARY "$type-$length-correct: $correct\n"; } } # get all the n-grams from the input corpus sub input_phrases { my ($factor) = (@_); %INPUT_PHRASE = (); open(INPUT,$input) or die "Can't read input $input"; while(my $line = ) { chop($line); $line = &get_factor_phrase($factor,$line); &extract_n_grams($line,\%INPUT_PHRASE); } close(INPUT); } # reduce a factorized phrase into the factors of interest sub get_factor_phrase { my ($factor,$line) = @_; # clean line $line =~ s/[\r\n]+//g; $line =~ s/\s+/ /; $line =~ s/^ //; $line =~ s/ $//; # only surface? 
delete remaining factors if (!defined($factor) || $factor eq "0") { $line =~ s/\|\S+//g; return $line; } my $factored_line = ""; # reduce each word foreach (split(/ /,$line)) { $factored_line .= &get_factor_word($factor,$_) . " "; } chop($factored_line); return $factored_line; } # reduce a factorized word into the factors of interest sub get_factor_word { my ($factor,$word) = @_; my @WORD = split(/\|/,$word); my $fword = ""; foreach (split(/,/,$factor)) { $fword .= $WORD[$_]."|"; } chop($fword); return $fword; } sub factor_ext { my ($factor) = @_; return "" if !defined($factor) || $factor eq "0"; return ".".$factor; } sub bleu_annotation { open(OUT,"| sort -r >$dir/bleu-annotation") or die "Cannot open: $!"; for(my $i=0;$i0; $ngram .= $WORD[$i+$n]; } $REF_NGRAM{$length}{$ngram}--; if ($REF_NGRAM{$length}{$ngram} >= 0) { $ngram_correct++; for(my $n=0;$n<$length;$n++) { $MATCH[$i+$n] = $length; } } } $bleu *= ($ngram_correct/(scalar(@WORD)-$length+2)); } $bleu = $bleu ** (1/4); my $ref_length = 9999; if (ref($REFERENCE[$i]) eq 'ARRAY') { foreach my $ref (@{$REFERENCE[$i]}) { my @RW = split(/ /,$ref); $ref_length = scalar(@RW) if scalar(@RW) < $ref_length; } } else { my @RW = split(/ /,$REFERENCE[$i]); $ref_length = scalar(@RW); } if (scalar(@WORD) < $ref_length && scalar(@WORD)>0) { $bleu *= exp(1-$ref_length/scalar(@WORD)); } printf OUT "%5.4f\t%d\t",$bleu,$i; for(my $i=0;$i $ref_count) ? $ref_count : $sys_count; $$CORRECT{$length}{$ngram} += $match_count; $$TOTAL{$length}{$ngram} += $sys_count; #print "$length:$ngram $sys_count $ref_count\n"; } } } sub ttable_coverage { my ($factor,$ttable) = @_; # open file if (! 
-e $ttable && -e $ttable.".gz") {
        open(TTABLE,"gzip -cd $ttable.gz|") or die "Cannot open: $!";
    }
    elsif ($ttable =~ /.gz$/) {
        open(TTABLE,"gzip -cd $ttable|") or die "Cannot open: $!";
    }
    else {
        open(TTABLE,$ttable) or die "Can't read ttable $ttable: $!";
    }

    # create report file
    open(REPORT,">$dir/ttable-coverage-by-phrase".&factor_ext($factor)) or die "Cannot open: $!";
    my ($last_in,$last_size,$size) = ("",0);

    # which score column holds the forward probability p(e|f)?
    my $p_e_given_f_score = 2;
    if ($score_options) {
        if ($score_options =~ /OnlyDirect/) {
            $p_e_given_f_score = 0;
        }
        elsif ($score_options =~ /NoLex/) {
            $p_e_given_f_score = 1;
        }
    }

    my @DISTRIBUTION = ();
    while(<TTABLE>) {   # restored: filehandle read was stripped
        chop;
        my @COLUMN = split(/ +\|\|\| +/);
        my ($in,$out,$scores) = @COLUMN;
        # handling hierarchical
        $in =~ s/ \[[^ \]]+\]$//; # remove lhs nt
        next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/; # only consider flat rules
        $in = &get_factor_phrase($factor,$in) if defined($factor) && $factor eq "0";
        $scores = $COLUMN[4] if defined($hierarchical); #scalar @COLUMN == 5;
        my @IN = split(/ /,$in);
        $size = scalar @IN;
        next unless defined($INPUT_PHRASE{$size}{$in});
        $TTABLE_COVERED{$size}{$in}++;
        my @SCORE = split(/ /,$scores);
        if ($in ne $last_in) {
            # source phrase changed: flush entropy for the previous phrase
            if ($last_in ne "") {
                my $entropy = &compute_entropy(@DISTRIBUTION);
                printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
                $TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
                @DISTRIBUTION = ();
            }
            $last_in = $in;
            $last_size = $size;
        }
        push @DISTRIBUTION, $SCORE[$p_e_given_f_score]; # forward probability
    }
    # flush the final phrase
    my $entropy = &compute_entropy(@DISTRIBUTION);
    printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
    $TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
    close(REPORT);
    close(TTABLE);

    &additional_coverage_reports($factor,"ttable",\%TTABLE_COVERED);
}

# entropy (in bits) of an unnormalized probability distribution
sub compute_entropy {
    my $z = 0; # normalization
    foreach my $p (@_) {
        $z += $p;
    }
    my $entropy = 0;
    foreach my $p (@_) {
        next if $p == 0;
        $entropy -= ($p/$z)*log($p/$z)/log(2);
    }
    return $entropy;
}

# how often do the known input phrases occur in the training corpus?
sub corpus_coverage {
    my ($factor) = @_;
    %CORPUS_COVERED = ();

    # compute how often input phrases occur in the corpus
    open(CORPUS,$corpus) or die "Can't read corpus $corpus";
    while(<CORPUS>) {   # restored: filehandle read was stripped
        my $line = &get_factor_phrase($factor,$_);
        my @WORD = split(/ /,$line);
        my $sentence_length = scalar @WORD;
        for(my $start=0;$start < $sentence_length;$start++) {
            my $phrase = "";
            for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) {
                $phrase .= " " if $length > 1;
                $phrase .= $WORD[$start+$length-1];
                # longer extensions cannot be input phrases either
                last if !defined($INPUT_PHRASE{$length}{$phrase});
                $CORPUS_COVERED{$length}{$phrase}++;
            }
        }
    }
    close(CORPUS);

    # report occurrence counts for all known input phrases
    open(REPORT,">$dir/corpus-coverage-by-phrase".&factor_ext($factor)) or die "Cannot open: $!";
    foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) {
        foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) {
            next unless defined $CORPUS_COVERED{$size}{$phrase};
            printf REPORT "%s\t%d\n", $phrase, $CORPUS_COVERED{$size}{$phrase};
        }
    }
    close(REPORT);

    &additional_coverage_reports($factor,"corpus",\%CORPUS_COVERED);
}

sub additional_coverage_reports {
    my ($factor,$name,$COVERED) = @_;

    # unknown word report ---- TODO: extend to rare words?
open(REPORT,">$dir/$name-unknown".&factor_ext($factor)) or die "Cannot open: $!"; foreach my $phrase (keys %{$INPUT_PHRASE{1}}) { next if defined($$COVERED{1}{$phrase}); printf REPORT "%s\t%d\n",$phrase,$INPUT_PHRASE{1}{$phrase}; } close(REPORT); # summary report open(REPORT,">$dir/$name-coverage-summary".&factor_ext($factor)) or die "Cannot open: $!"; foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) { my (%COUNT_TYPE,%COUNT_TOKEN); foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) { my $covered = $$COVERED{$size}{$phrase}; $covered = 0 unless defined($covered); $COUNT_TYPE{$covered}++; $COUNT_TOKEN{$covered} += $INPUT_PHRASE{$size}{$phrase}; } foreach my $count (sort {$a <=> $b} keys %COUNT_TYPE) { printf REPORT "%d\t%d\t%d\t%d\n",$size,$count,$COUNT_TYPE{$count},$COUNT_TOKEN{$count}; } } close(REPORT); } sub input_annotation { open(OUT,">$dir/input-annotation") or die "Cannot open: $!";; open(INPUT,$input) or die "Can't read input $input"; while() { chop; s/\|\S+//g; # remove additional factors s/<\S[^>]*>//g; # remove xml markup s/\s+/ /g; s/^ //; s/ $//; # remove redundant spaces print OUT $_."\t"; my @WORD = split; my $sentence_length = scalar @WORD; for(my $start=0;$start < $sentence_length;$start++) { my $phrase = ""; for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) { $phrase .= " " if $length > 1; $phrase .= $WORD[$start+$length-1]; my $ttable_covered = $TTABLE_COVERED{$length}{$phrase}; my $corpus_covered = $CORPUS_COVERED{$length}{$phrase}; next unless defined($ttable_covered) || defined($corpus_covered); my $ttable_entropy = $TTABLE_ENTROPY{$length}{$phrase} || 0; #$ttable_entropy = 0 unless defined($ttable_entropy); $ttable_covered = 0 unless defined($ttable_covered); $corpus_covered = 0 unless defined($corpus_covered); if (defined($TTABLE_COVERED{$length}{$phrase})) { printf OUT "%d-%d:%d:%d:%.5f ",$start,$start+$length-1,$corpus_covered,$ttable_covered,$ttable_entropy; } } } print OUT "\n"; } close(INPUT); 
close(OUT); } sub extract_n_grams_arrayopt { my ($sentence,$NGRAM,$minmax) = @_; if (ref($sentence) eq 'ARRAY') { my %MINMAX_NGRAM; &extract_n_grams($$sentence[0],\%MINMAX_NGRAM); for(my $i=1;$i $SET_NGRAM{$length}{$ngram}) { $MINMAX_NGRAM{$length}{$ngram} = $SET_NGRAM{$length}{$ngram}; } } } else { foreach my $ngram (keys %{$SET_NGRAM{$length}}) { if (!defined($MINMAX_NGRAM{$length}{$ngram}) || $SET_NGRAM{$length}{$ngram} > $MINMAX_NGRAM{$length}{$ngram}) { $MINMAX_NGRAM{$length}{$ngram} = $SET_NGRAM{$length}{$ngram}; } } } } } for(my $length=1;$length<=$MAX_LENGTH;$length++) { foreach my $ngram (keys %{$MINMAX_NGRAM{$length}}) { $$NGRAM{$length}{$ngram} += $MINMAX_NGRAM{$length}{$ngram}; } } } else { &extract_n_grams($sentence,$NGRAM); } } sub extract_n_grams { my ($sentence,$NGRAM) = @_; $sentence =~ s/[\r\n]+//g; $sentence =~ s/\s+/ /g; $sentence =~ s/^ //; $sentence =~ s/ $//; my @WORD = split(/ /,$sentence); for(my $length=1;$length<=$MAX_LENGTH;$length++) { for(my $i=0;$i<=scalar(@WORD)-$length;$i++) { my $ngram = ""; for(my $n=0;$n<$length;$n++) { $ngram .= " " if $n>0; $ngram .= $WORD[$i+$n]; } $$NGRAM{$length}{$ngram}++; } } } sub precision_by_coverage { my ($coverage_type) = @_; my (%PREC_BY_WORD,%TOTAL_BY_WORD,%LENGTH_BY_WORD,%DELETED_BY_WORD); my (%PREC_BY_COVERAGE,%TOTAL_BY_COVERAGE,%LENGTH_BY_COVERAGE,%DELETED_BY_COVERAGE); my (%PREC_BY_FACTOR,%TOTAL_BY_FACTOR,%LENGTH_BY_FACTOR,%DELETED_BY_FACTOR); my (%PREC_BY_FACTOR_COVERAGE,%TOTAL_BY_FACTOR_COVERAGE,%LENGTH_BY_FACTOR_COVERAGE,%DELETED_BY_FACTOR_COVERAGE); # get coverage statistics my %COVERAGE; print STDERR "".(defined($coverage_dir)?$coverage_dir:$dir) ."/$coverage_type-coverage-by-phrase"; open(COVERAGE,(defined($coverage_dir)?$coverage_dir:$dir) ."/$coverage_type-coverage-by-phrase") or die "Cannot open: $!"; while() { chop; my ($phrase,$count) = split(/\t/); $COVERAGE{$phrase} = $count; } close(COVERAGE); # go through each line... 
open(FILE,$segmentation) || die("ERROR: could not open segmentation file $segmentation"); open(INPUT,$input) or die "Can't read input $input"; open(ALIGNMENT,$system_alignment) or die "Can't read output alignment file $system_alignment"; # get marked up output my $line_count = 0; while(my $line = ) { chop($line); # get corresponding input line my $input = ; my @INPUT = split(/ /,&get_factor_phrase(0,$input)); # surface my @FACTOR = split(/ /,&get_factor_phrase($precision_by_coverage_factor,$input)); # word alignment my $alignment = ; my %ALIGNED; foreach (split(/ /,$alignment)) { my ($input_pos,$output_pos) = split(/\-/,$_); push @{$ALIGNED{$input_pos}}, $output_pos; } # output words # @SYSTEM is already collected my @OUTPUT = split(/ /,$SYSTEM[$line_count]); # compute precision of each ngram # @REFERENCE (possibly multiple) is already collected my (%SYS_NGRAM,%REF_NGRAM,%PREC_NGRAM); &extract_n_grams( $SYSTEM[$line_count], \%SYS_NGRAM ); &extract_n_grams_arrayopt( $REFERENCE[$line_count++], \%REF_NGRAM, "max" ); foreach my $ngram (keys %{$SYS_NGRAM{1}}) { # note: only interested in unigram precision $PREC_NGRAM{1}{$ngram} = 0; if (defined($REF_NGRAM{1}) && defined($REF_NGRAM{1}{$ngram})) { my $ref_count = $REF_NGRAM{1}{$ngram}; my $sys_count = $SYS_NGRAM{1}{$ngram}; $PREC_NGRAM{1}{$ngram} = ($ref_count >= $sys_count) ? 
1 : $ref_count/$sys_count; } } close(REPORT); # process one phrase at a time my $output_pos = 0; while($line =~ /([^|]+) \|(\d+)\-(\d+)\|\s*(.*)$/) { my ($output,$from,$to) = ($1,$2,$3); $line = $4; # bug fix: 1-1 unknown word mappings get alignment point if ($from == $to && # one scalar(split(/ /,$output)) == 1 && # to one !defined($ALIGNED{$from})) { # but not aligned push @{$ALIGNED{$from}},$output_pos; } $output_pos += scalar(split(/ /,$output)); # compute precision for each word for(my $i=$from; $i<=$to; $i++) { my $coverage = 0; $coverage = $COVERAGE{$INPUT[$i]} if defined($COVERAGE{$INPUT[$i]}); my ($precision,$deleted,$length) = (0,0,0); # unaligned? note as deleted if (!defined($ALIGNED{$i})) { $deleted = 1; } # aligned else { foreach my $o (@{$ALIGNED{$i}}) { $precision += $PREC_NGRAM{1}{$OUTPUT[$o]}; } $precision /= scalar(@{$ALIGNED{$i}}); # average, if multi-aligned $length = scalar(@{$ALIGNED{$i}}); } my $word = $INPUT[$i]; $word .= "\t".$FACTOR[$i] if $precision_by_coverage_factor; $DELETED_BY_WORD{$word} += $deleted; $PREC_BY_WORD{$word} += $precision; $LENGTH_BY_WORD{$word} += $length; $TOTAL_BY_WORD{$word}++; $DELETED_BY_COVERAGE{$coverage} += $deleted; $PREC_BY_COVERAGE{$coverage} += $precision; $LENGTH_BY_COVERAGE{$coverage} += $length; $TOTAL_BY_COVERAGE{$coverage}++; if ($precision_by_coverage_factor) { $DELETED_BY_FACTOR{$FACTOR[$i]} += $deleted; $DELETED_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $deleted; $PREC_BY_FACTOR{$FACTOR[$i]} += $precision; $PREC_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $precision; $LENGTH_BY_FACTOR{$FACTOR[$i]} += $length; $LENGTH_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $length; $TOTAL_BY_FACTOR{$FACTOR[$i]}++; $TOTAL_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage}++; } } } } close(FILE); open(REPORT,">$dir/precision-by-$coverage_type-coverage") or die "Cannot open: $!"; foreach my $coverage (sort {$a <=> $b} keys %TOTAL_BY_COVERAGE) { printf REPORT "%d\t%.3f\t%d\t%d\t%d\n", $coverage, 
$PREC_BY_COVERAGE{$coverage}, $DELETED_BY_COVERAGE{$coverage}, $LENGTH_BY_COVERAGE{$coverage}, $TOTAL_BY_COVERAGE{$coverage}; } close(REPORT); open(REPORT,">$dir/precision-by-input-word") or die "Cannot open: $!"; foreach my $word (keys %TOTAL_BY_WORD) { my ($w,$f) = split(/\t/,$word); my $coverage = 0; $coverage = $COVERAGE{$w} if defined($COVERAGE{$w}); printf REPORT "%.3f\t%d\t%d\t%d\t%d\t%s\n", $PREC_BY_WORD{$word}, $DELETED_BY_WORD{$word}, $LENGTH_BY_WORD{$word}, $TOTAL_BY_WORD{$word},$coverage,$word; } close(REPORT); if ($precision_by_coverage_factor) { open(REPORT,">$dir/precision-by-$coverage_type-coverage.$precision_by_coverage_factor") or die "Cannot open: $!"; foreach my $factor (sort keys %TOTAL_BY_FACTOR_COVERAGE) { foreach my $coverage (sort {$a <=> $b} keys %{$TOTAL_BY_FACTOR_COVERAGE{$factor}}) { printf REPORT "%s\t%d\t%.3f\t%d\t%d\t%d\n", $factor, $coverage, $PREC_BY_FACTOR_COVERAGE{$factor}{$coverage}, $DELETED_BY_FACTOR_COVERAGE{$factor}{$coverage}, $LENGTH_BY_FACTOR_COVERAGE{$factor}{$coverage}, $TOTAL_BY_FACTOR_COVERAGE{$factor}{$coverage}; } } close(REPORT); } } sub segmentation { my %SEGMENTATION; open(FILE,$segmentation) || die("ERROR: could not open segmentation file $segmentation"); open(OUT,">$dir/segmentation-annotation") or die "Cannot open: $!"; while() { chop; my $count=0; my $out = -1; foreach (split) { if (/^\|(\d+)\-(\d+)\|$/) { print OUT " " unless $out-($count-1) == 0; printf OUT "%d:%d:%d:%d",$1,$2,$out-($count-1),$out; my $in_count = $2-$1+1; $SEGMENTATION{$in_count}{$count}++; $count = 0; } else { $out++; $count++; } } print OUT "\n"; } close(OUT); close(FILE); open(SUMMARY,">$dir/segmentation") or die "Cannot open: $!"; foreach my $in (sort { $a <=> $b } keys %SEGMENTATION) { foreach my $out (sort { $a <=> $b } keys %{$SEGMENTATION{$in}}) { printf SUMMARY "%d\t%d\t%d\n", $in, $out, $SEGMENTATION{$in}{$out}; } } close(SUMMARY); # TODO: error by segmentation } # analyze the trace file to collect statistics over the # 
# hierarchical derivations and also create segmentation annotation
sub hierarchical_segmentation {
    my $last_sentence = -1;
    my @DERIVATION;
    my %STATS;
    open(TRACE,$segmentation.".trace") or die "Cannot open: $!";
    open(INPUT_TREE,">$dir/input-tree") or die "Cannot open: $!";
    open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
    open(NODE,">$dir/node") or die "Cannot open: $!";
    while(<TRACE>) {   # restored: filehandle read was stripped
        my $sentence;
        my %ITEM;
        &hs_scan_line($_, \$sentence, \%ITEM) || die("cannot scan line $_");
        # new sentence reached: process the completed derivation
        if ($last_sentence >= 0 && $sentence != $last_sentence) {
            &hs_process($last_sentence,\@DERIVATION,\%STATS);
            @DERIVATION = ();
        }
        push @DERIVATION,\%ITEM;
        $last_sentence = $sentence;
    }
    &hs_process($last_sentence,\@DERIVATION,\%STATS);
    close(TRACE);
    close(NODE);
    close(INPUT_TREE);
    close(OUTPUT_TREE);

    open(SUMMARY,">$dir/rule") or die "Cannot open: $!";
    print SUMMARY "sentence-count\t".(++$last_sentence)."\n";
    print SUMMARY "glue-rule\t".$STATS{'glue-rule'}."\n";
    print SUMMARY "depth\t".$STATS{'depth'}."\n";
    foreach (keys %{$STATS{'rule-type'}}) {
        print SUMMARY "rule\t$_\t".$STATS{'rule-type'}{$_}."\n";
    }
    close(SUMMARY);
}

# scan a single line of the trace file
sub hs_scan_line {
    my ($line,$ref_sentence,$ref_item) = @_;

    if ($line =~ /^Trans Opt/) {
        # Old format
        $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
            $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ ||
            return 0;
        my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
        ${$ref_sentence} = $sentence;
        $ref_item->{'start'} = $start;
        $ref_item->{'end'} = $end;
        $ref_item->{'rule_lhs'} = $rule_lhs;
        # NOTE(review): substitution pattern was lost in the garbled source;
        # '&gt;' un-escaping is the most plausible original -- confirm upstream
        $rule_rhs =~ s/&gt;/>/g;
        @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs);
        foreach (split(/ /,$alignment)) {
            /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
            $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
            $ref_item->{'alignedSpan'}{$1} = 1;
        }
        @{$ref_item->{'spans'}} = ();
        foreach
my $span (reverse split(/\s+/,$spans)) { $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n"); my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 ); push @{$ref_item->{'spans'}}, \%SPAN; } } else { # New format $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0; my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6); ${$ref_sentence} = $sentence; @{$ref_item->{'spans'}} = (); foreach (split(/ /,$source_rhs)) { /^\[?([^\]]+)\]?$/; my %SPAN = ( 'word' => $1 ); push @{$ref_item->{'spans'}}, \%SPAN; } my $i = 0; foreach my $span (split(/ /,$source_spans)) { $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n"); $ref_item->{'spans'}[$i]{'from'} = $1; $ref_item->{'spans'}[$i]{'to'} = $2; if ($i == 0) { $ref_item->{'start'} = $1; } $ref_item->{'end'} = $2; $i++; } $ref_item->{'rule_lhs'} = $target_lhs; $target_rhs =~ s//>/g; @{$ref_item->{'rule_rhs'}} = (); foreach (split(/ /,$target_rhs)) { /^\[?([^\]]+)\]?$/; push @{$ref_item->{'rule_rhs'}}, $1; } foreach (split(/ /,$alignment)) { /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n"); $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span $ref_item->{'alignedSpan'}{$1} = 1; } } return 1; } # process a single sentence for hierarchical segmentation sub hs_process { my ($sentence,$DERIVATION,$STATS) = @_; my $DROP_RULE = shift @{$DERIVATION}; # get rid of S -> S my $max = $$DERIVATION[0]{'end'}; # consolidate glue rules into one rule my %GLUE_RULE; $GLUE_RULE{'start'} = 1; $GLUE_RULE{'end'} = $max; $GLUE_RULE{'rule_lhs'} = "S"; $GLUE_RULE{'depth'} = 0; my $x=0; while(1) { my $RULE = shift @{$DERIVATION}; if (scalar(@{$$RULE{'rule_rhs'}}) == 2 && ($$RULE{'rule_lhs'} eq "S" && $$RULE{'rule_rhs'}[0] eq "S" && $$RULE{'rule_rhs'}[1] eq "X") || ($$RULE{'rule_lhs'} eq "Q" && $$RULE{'rule_rhs'}[0] eq "Q")) { unshift @{$GLUE_RULE{'spans'}},$$RULE{'spans'}[1]; push @{$GLUE_RULE{'rule_rhs'}}, 
$$RULE{'rule_rhs'}[1]; $GLUE_RULE{'alignment'}{$x} = $x; $GLUE_RULE{'alignedSpan'}{$x} = 1; $x++; } else { unshift @{$DERIVATION}, $RULE; last; } } unshift @{$DERIVATION}, \%GLUE_RULE; $$STATS{'glue-rule'} += $x; # create chart my %CHART; foreach my $RULE (@{$DERIVATION}) { $CHART{$$RULE{'start'}}{$$RULE{'end'}} = $RULE; } # compute depth &hs_compute_depth(1,$max,0,\%CHART); my $max_depth = 0; foreach my $RULE (@{$DERIVATION}) { next unless defined($$RULE{'depth'}); # better: delete offending rule S -> S $max_depth = $$RULE{'depth'} if $$RULE{'depth'} > $max_depth; } &hs_recompute_depth(1,$max,\%CHART,$max_depth); $$STATS{'depth'} += $max_depth; # build matrix of divs my @MATRIX; &hs_create_out_span(1,$max,\%CHART,\@MATRIX); print OUTPUT_TREE &hs_output_matrix($sentence,\@MATRIX,$max_depth); my @MATRIX_IN; &hs_create_in_span(1,$max,\%CHART,\@MATRIX_IN); print INPUT_TREE &hs_output_matrix($sentence,\@MATRIX_IN,$max_depth); # number rules and get their children my $id = 0; foreach my $RULE (@{$DERIVATION}) { next unless defined($$RULE{'start_div'}); # better: delete offending rule S -> S $$STATS{'rule-type'}{&hs_rule_type($RULE)}++ if $id>0; $$RULE{'id'} = $id++; } &hs_get_children(1,$max,\%CHART); foreach my $RULE (@{$DERIVATION}) { next unless defined($$RULE{'start_div'}); # better: delete offending rule S -> S print NODE $sentence." "; print NODE $$RULE{'depth'}." "; print NODE $$RULE{'start_div'}." ".$$RULE{'end_div'}." "; print NODE $$RULE{'start_div_in'}." ".$$RULE{'end_div_in'}." 
"; print NODE join(",",@{$$RULE{'children'}})."\n"; } } sub hs_output_matrix { my ($sentence,$MATRIX,$max_depth) = @_; my @OPEN; my $out = ""; for(my $d=0;$d<=$max_depth;$d++) { push @OPEN, 0; } foreach my $SPAN (@$MATRIX) { $out .= $sentence."\t"; for(my $d=0;$d<=$max_depth;$d++) { my $class = " "; my $closing_flag = 0; if (defined($$SPAN{'closing'}) && defined($$SPAN{'closing'}{$d})) { $closing_flag = 1; } if ($d == $$SPAN{'depth'}) { if (defined($$SPAN{'opening'}) && $closing_flag) { $class = "O"; } elsif(defined($$SPAN{'opening'})) { $class = "["; } elsif($closing_flag) { $class = "]"; } else { $class = "-"; } } elsif ($closing_flag) { $class = "]"; } elsif ($OPEN[$d]) { $class = "-"; } $out .= $class; } $out .= "\t"; $out .= $$SPAN{'lhs'} if defined($$SPAN{'lhs'}); $out .= "\t"; $out .= $$SPAN{'rhs'} if defined($$SPAN{'rhs'}); $out .= "\n"; $OPEN[$$SPAN{'depth'}] = 1 if defined($$SPAN{'opening'}); if(defined($$SPAN{'closing'})) { for(my $d=$max_depth;$d>=0;$d--) { $OPEN[$d] = 0 if defined($$SPAN{'closing'}{$d}); } } } return $out; } sub hs_rule_type { my ($RULE) = @_; my $type = ""; # output side my %NT; my $total_word_count = 0; my $word_count = 0; my $nt_count = 0; for(my $i=0;$i 0; $word_count = 0; my $nt = chr(97+$nt_count++); $NT{$$RULE{'alignment'}{$i}} = $nt; $type .= $nt; } else { $word_count++; $total_word_count++; } } $type .= $word_count if $word_count > 0; $type .= ":".$total_word_count.":".$nt_count.":"; # input side $word_count = 0; $total_word_count = 0; for(my $i=0;$i 0; $word_count = 0; $type .= $NT{$i}; } else { $word_count++; $total_word_count++; } } $type .= $word_count if $word_count > 0; $type .= ":".$total_word_count; return $type; } # compute depth of each node sub hs_compute_depth { my ($start,$end,$depth,$CHART) = @_; if (!defined($$CHART{$start}{$end})) { print STDERR "warning: illegal span ($start,$end)\n"; return; } my $RULE = $$CHART{$start}{$end}; $$RULE{'depth'} = $depth; for(my $i=0;$i) { my 
($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score); if (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) { ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); $heuristic_rule_score = $rule_score; # hmmmm.... } else { die("ERROR: buggy search graph line: $_"); } chop($alignment) if $alignment; chop($children) if $children; $recomb = 0 unless $recomb; $children = "" unless defined $children; $alignment = "" unless defined $alignment; if ($last_sentence != $sentence) { close(SENTENCE) if $sentence; open(SENTENCE,">$dir/search-graph/graph.$sentence"); $last_sentence = $sentence; } print SENTENCE "$id\t$recomb\t$from\t$to\t$output\t$alignment\t$children\t$rule_score\t$heuristic_rule_score\t$hyp_score\t$lhs\n"; } close(OSG); close(SENTENCE); }