From 20f49a1ded38d07ea2cb6918aa27ce9f934a36e6 Mon Sep 17 00:00:00 2001 From: eherbst Date: Mon, 14 Aug 2006 16:09:21 +0000 Subject: fixed legend display git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@723 1f5c12ca-751b-0410-a591-d2e778427230 --- scripts/analysis/sentence-by-sentence.pl | 384 ++++++++++++++++++------------ scripts/analysis/smtgui/README | 11 + scripts/analysis/smtgui/file-descriptions | 1 + scripts/analysis/smtgui/file-factors | 2 + 4 files changed, 248 insertions(+), 150 deletions(-) (limited to 'scripts/analysis') diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl index 7ca053981..b6e179dd3 100755 --- a/scripts/analysis/sentence-by-sentence.pl +++ b/scripts/analysis/sentence-by-sentence.pl @@ -1,100 +1,136 @@ #!/usr/bin/perl -w #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors -#usage: sentence-by-sentence SYSOUT REFERENCE > sentences.html +#usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html use strict; use Getopt::Long; -my $sourcefile = undef; # source text +my $sourcefile = undef; GetOptions( - "source=s" => \$sourcefile, + "source=s" => \$sourcefile, ) or exit(1); -my ($sysoutfile, $truthfile) = @ARGV; +my ($sysoutfile, @truthfiles) = @ARGV; -if (!defined $sysoutfile || !defined $truthfile) { - print STDERR "usage: $0 hypothesis reference > sentence-by-sentence.html +if (!defined $sysoutfile || scalar(@truthfiles) == 0) { + print STDERR " +usage: $0 system_output reference(s...) > sentence-by-sentence.html Options: - --source=STRING ... source sentences + --source STRING ... foreign original + +N-grams are colored by the number of supporting references: + red for fewest, green for most, mediate shades otherwise. "; - exit 1; + exit(1); } -open(SYSOUT, "<$sysoutfile") or die "couldn't open '$sysoutfile' for read\n"; -open(TRUTH, "<$truthfile") or die "couldn't open '$truthfile' for read\n"; +my @TRUTHS = () x scalar(@truthfiles); +for(my $i = 0; $i < scalar(@truthfiles); $i++) +{ + open($TRUTHS[$i], "<$truthfiles[$i]") or die "couldn't open '$truthfiles[$i]' for read: $!\n"; + binmode($TRUTHS[$i], ":utf8"); +} +open(SYSOUT, "<$sysoutfile") or die "couldn't open '$sysoutfile' for read: $!\n"; binmode(SYSOUT, ":utf8"); -binmode(TRUTH, ":utf8"); binmode(STDOUT, ":utf8"); -if (defined $sourcefile) { - open(SOURCE, "<$sourcefile") or die "couldn't open '$sourcefile' for read\n"; - binmode(SOURCE, ":utf8"); +if (defined $sourcefile) +{ + open(SOURCE, "<$sourcefile") or die "couldn't open '$sourcefile' for read: $!\n"; + binmode(SOURCE, ":utf8"); } my @bleuScores; my @htmlSentences; my @htmlColors = ('#99ff99', '#aaaaff', '#ffff99', '#ff9933', '#ff9999'); #color sentences by rank (split in n tiers) -my @ngramColors = ('#ffff99', '#ff9933', '#99ff99', '#aaaaff', '#ff99ff'); #to be colorful and tell consecutive n-grams apart on the display +my $ngramSingleRefColor = '#aaffaa'; +my @ngramMultirefColors = ('ff9999', 'ff9933', 'ffff99', 'a0a0ff', '99ff99'); #arbitrary-length list; first entry is used for worst n-grams my $i = 0; while(my $sLine = ) { - my $eLine = ; - die "$truthfile shorter than $sysoutfile!" if !defined $eLine; - chomp $sLine; chomp $eLine; - $sLine =~ s/^\s*|\s*$//g; - $eLine =~ s/^\s*|\s*$//g; - my @sWords = split(/\s+/, $sLine); - my @eWords = split(/\s+/, $eLine); - my @sFactors = map {my @f = split(/\|/, $_); \@f;} @sWords; - my @eFactors = map {my @f = split(/\|/, $_); \@f;} @eWords; - my @sourceFactors; - if (defined $sourcefile) { - my $sourceLine = ; - die "$sourcefile shorter than $sysoutfile!" if !defined $sourceLine; - chomp $sourceLine; - $sourceLine =~ s/^\s*|\s*$//g; - my @sourceWords = split(/\s+/, $sourceLine); - @sourceFactors = map {my @f = split(/\|/, $_); \@f;} @sourceWords; - } - + my @sFactors = @{extractFactorArrays($sLine)}; + my @eLines = () x scalar(@truthfiles); + my @eFactors; + for(my $j = 0; $j < scalar(@truthfiles); $j++) + { + my $fh = $TRUTHS[$j]; + $eLines[$j] = <$fh>; + push @eFactors, extractFactorArrays($eLines[$j], "$truthfiles[$j] shorter than $sysoutfile"); + } + my $sourceFactors; + if (defined $sourcefile) + { + my $sourceLine = ; + $sourceFactors = extractFactorArrays($sourceLine, "$sourcefile shorter than $sysoutfile"); + } + my $bleuData = getBLEUSentenceDetails(\@sFactors, \@eFactors, 0); push @bleuScores, [$i, $bleuData->[0]->[0], 0]; #the last number will be the rank my $pwerData = getPWERSentenceDetails(\@sFactors, \@eFactors, 0); - my $html = "
"; #the %%%% is a flag to be replaced - $html .= "
Sentence $i) BLEU: " . sprintf("%.4lg", $bleuData->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$bleuData->[0]}[1 .. 4]) . ")
\n"; - $html .= "

Source

" . getFactoredSentenceHTML(\@sourceFactors) . "
\n" if defined $sourcefile; - $html .= "

Reference

" . getFactoredSentenceHTML(\@eFactors) . "
\n"; + my $html = "
"; #the %%%% and other tokens like it are flags to be replaced + $html .= "
Sentence $i)     BLEU: " . sprintf("%.4lg", $bleuData->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$bleuData->[0]}[1 .. 4]) . ")
\n"; + if(defined $sourcefile) + { + $html .= "\n"; + } + for(my $j = 0; $j < scalar(@truthfiles); $j++) + { + $html .= "\n"; + } my $j = 0; - $html .= "

System Output

(PWER errors marked)

" . getFactoredSentenceHTML(\@sFactors, $pwerData) . "
\n"; + $html .= "\n"; $j = 0; - $html .= "

N-grams

" . getAllNgramsHTML(\@sFactors, $bleuData->[1]) . "
\n"; - $html .= "\n"; + $html .= "\n"; + $html .= "
Source" . getFactoredSentenceHTML($sourceFactors) . "
Ref $j" . getFactoredSentenceHTML($eFactors[$j]) . "
Output" . getFactoredSentenceHTML(\@sFactors, $pwerData) . "
N-grams" . getAllNgramsHTML(\@sFactors, $bleuData->[1], scalar(@truthfiles)) . "
\n"; push @htmlSentences, $html; $i++; } close(SYSOUT); -close(TRUTH); +foreach my $truthfh (@TRUTHS) {close($truthfh);} rankSentencesByBLEU(\@bleuScores); my $stylesheet = < -h2 {font-weight: bold; font-size: large; margin-bottom: 12px} -h4 {font-weight: bold; font-size: small} -#legend {background-color: #fff; border: 1px solid #000; padding: 2px; margin-bottom: 15px} -#legend_title {font-weight: bold; font-size: medium; text-decoration: underline} -div.sentence {background-color: #ffffee; border: 1px solid #000088; padding: 0px 8px 0px 8px} //entire composition +.legend {background-color: #fff; border: 1px solid #000; padding: 2px; margin-bottom: 10px; margin-right: 15px} +.legend_title {font-weight: bold; font-size: medium; text-decoration: underline} +div.sentence {background-color: #ffffee; border: 1px solid #000088; padding: 0px 8px 0px 8px} //entire composition for a given sentence +div.sentence td {margin: 8px 0px 8px 0px} div.bleu_report {margin-bottom: 5px} -div.source_sentence {background-color: #ffcccc; border: 1px solid #bbb; margin: 8px 0px 8px 0px} -div.truth_sentence {background-color: #ccffcc; border: 1px solid #bbb; margin: 8px 0px 8px 0px} -div.sysout_sentence {background-color: #ccccff; border: 1px solid #bbb; margin: 8px 0px 8px 0px} +td.sent_title {font-weight: bold; font-size: medium; margin-bottom: 12px} +.source_sentence {background-color: #ffcccc; border: 1px solid #bbb} +.truth_sentence {background-color: #ccffcc; border: 1px solid #bbb} +.sysout_sentence {background-color: #ccccff; border: 1px solid #bbb} table.sentence_table {border: none} -div.sysout_ngrams {background-color: #fff; border: 1px solid #bbb; margin-top: 8px; margin-bottom: 8px} +.sysout_ngrams {background-color: #fff; border: 1px solid #bbb} table.ngram_table {} td.ngram_cell {padding: 1px} EOHTML print "\n"; print "\n"; -print "$sysoutfile vs. $truthfile: Sentence-by-sentence Comparison$stylesheet\n"; +print "$sysoutfile vs. [" . join(' ', @truthfiles) . "]: Sentence-by-sentence Comparison$stylesheet\n"; + +#javascript to sort by BLEU, by order in corpus, ... +my %rank2index = map {$bleuScores[$_]->[2] => $_} (0 .. scalar(@htmlSentences) - 1); +print "\n"; #legend for background colors my @minBLEU = (1e9) x scalar(@htmlColors); @@ -105,25 +141,35 @@ for(my $k = 0; $k < scalar(@htmlSentences); $k++) if($bleuScores[$k]->[1] < $minBLEU[$tier]) {$minBLEU[$tier] = $bleuScores[$k]->[1];} elsif($bleuScores[$k]->[1] > $maxBLEU[$tier]) {$maxBLEU[$tier] = $bleuScores[$k]->[1];} } -print "
BLEU Ranges"; +print "
BLEU Ranges (sentence backgrounds)"; for(my $k = 0; $k < scalar(@htmlColors); $k++) { print ""; } -print "
" . sprintf("%.4lg", $minBLEU[$k]) . " - " . sprintf("%.4lg", $maxBLEU[$k]) . "
\n"; +print "
\n"; +print "
N-gram Colors => Number of Matching Reference Translations"; +for(my $k = 1; $k <= scalar(@truthfiles); $k++) +{ + print ""; +} +print "
$k
+PWER errors are marked in red on output sentence displays.
+
Sort by BLEU score | corpus order (default)
\n"; #sentence boxes +print "
"; my $j = 0; foreach my $sentenceHTML (@htmlSentences) { - if($j > 0) {print "
";} + print "
"; + print "
"; my $bgcolor = getSentenceBGColorHTML($bleuScores[$j], $i); #i is now # of sentences $sentenceHTML =~ s/%%%%/$bgcolor/; - print "$sentenceHTML\n"; + print "$sentenceHTML
\n"; $j++; } -print ""; +print "
"; ##################### utils ##################### @@ -139,111 +185,153 @@ sub max my ($a, $b) = @_; return ($a > $b) ? $a : $b; } +#arguments: a list of elements +#return undef for an empty list, the max element otherwise +sub maxN +{ + if(scalar @_ == 0) {return undef;} + my $val = shift @_; + foreach my $e (@_) {if($e > $val) {$val = $e;}} + return $val; +} #arguments: x sub my_log { return -9999999999 unless $_[0]; return log($_[0]); } +#arguments: x +sub round +{ + my $x = shift; + return ($x - int($x) < .5) ? int($x) : int($x) + 1; +} ############################################################################################################################################################### -#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentence (same), factor index to use -#return: arrayref of [arrayref of [overall BLEU score, n-gram precisions], arrayref of matching n-gram [start index, length]] +#arguments: line read from corpus file, (optionally) string to die with if line isn't defined (default die-msg is empty) +#return: sentence (arrayref of arrayrefs of factor strings) taken from line +sub extractFactorArrays +{ + my ($line, $msg) = (shift, ''); + $msg = shift if scalar(@_); + die $msg if !defined $line; + chomp $line; + $line =~ s/^\s*|\s*$//g; #added by Ondrej to handle moses-mert-parallel output + my @words = split(/\s+/, $line); + my @factors = map {my @f = split(/\|/, $_); \@f;} @words; + return \@factors; +} + +#can handle multiple reference translations; assume at least one +#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use +#return: arrayref of [arrayref of [overall BLEU score, n-gram precisions], arrayref of matching n-gram [start index, length, arrayref of indices of matching references]] sub getBLEUSentenceDetails { - my ($refSysOutput, $refTruth, $factorIndex) = @_; - my ($length_reference, $length_translation) = (scalar(@$refTruth), scalar(@$refSysOutput)); - my ($correct1, $correct2, $correct3, $correct4, $total1, $total2, $total3, $total4) = (0, 0, 0, 0, 0, 0, 0, 0); + my $maxNgramOrder = 4; + my ($refSysOutput, $refTruths, $factorIndex) = @_; + my $length_translation = scalar(@$refSysOutput); #length of sysout sentence + my @length_references = map {scalar(@$_)} @$refTruths; + my $closestTruthLength = (sort(map {abs($_ - $length_translation)} @length_references))[0]; + my @correct = (0) x $maxNgramOrder; #n-gram counts + my @total = (0) x $maxNgramOrder; #n-gram counts my $returnData = [[], []]; - my %REF_GRAM = (); + my %REF_GRAM; #hash from n-gram to arrayref with # of times found in each reference my $ngramMatches = []; #arrayref of n-gram [start index, length] - my ($i, $gram); - for($i = 0; $i < $length_reference; $i++) + for(my $j = 0; $j < scalar(@$refTruths); $j++) { - $gram = $refTruth->[$i]->[$factorIndex]; - $REF_GRAM{$gram}++; - next if $i<1; - $gram = $refTruth->[$i - 1]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - next if $i<2; - $gram = $refTruth->[$i - 2]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - next if $i<3; - $gram = $refTruth->[$i - 3]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; + for(my $i = 0; $i < $length_references[$j]; $i++) + { + my $gram = ''; + for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders + { + $gram = $refTruths->[$j]->[$i - $k]->[$factorIndex] . " " . $gram; + #increment the count for the given n-gram and given reference number + if(!exists $REF_GRAM{$gram}) + { + my @tmp = (0) x scalar @$refTruths; + $tmp[$j] = 1; + $REF_GRAM{$gram} = \@tmp; + } + else + { + $REF_GRAM{$gram}->[$j]++; + } + } + } } - for($i = 0; $i < $length_translation; $i++) + for(my $i = 0; $i < $length_translation; $i++) { - $gram = $refSysOutput->[$i]->[$factorIndex]; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - $correct1++; - push @$ngramMatches, [$i, 1]; - } - next if $i<1; - $gram = $refSysOutput->[$i - 1]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - $correct2++; - push @$ngramMatches, [$i - 1, 2]; - } - next if $i<2; - $gram = $refSysOutput->[$i - 2]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - $correct3++; - push @$ngramMatches, [$i - 2, 3]; - } - next if $i<3; - $gram = $refSysOutput->[$i - 3]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - $correct4++; - push @$ngramMatches, [$i - 3, 4]; - } + my $gram = ''; + for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders + { + $gram = $refSysOutput->[$i - $k]->[$factorIndex] . " " . $gram; + if(exists $REF_GRAM{$gram}) #this n-gram was found in at least one reference + { + $correct[$k]++; + my @indices = (); + for(my $m = 0; $m < scalar(@{$REF_GRAM{$gram}}); $m++) + { + if($REF_GRAM{$gram}->[$m] > 0) + { + push @indices, $m; + $REF_GRAM{$gram}->[$m]--; + } + } + push @$ngramMatches, [$i - $k, $k + 1, \@indices]; + } + } } - my $total = $length_translation; - $total1 = max(1, $total); - $total2 = max(1, $total - 1); - $total3 = max(1, $total - 2); - $total4 = max(1, $total - 3); - my $brevity = ($length_translation > $length_reference || $length_translation == 0) ? 1 : exp(1 - $length_reference / $length_translation); - my ($pct1, $pct2, $pct3, $pct4) = ($total1 == 0 ? -1 : $correct1 / $total1, $total2 == 0 ? -1 : $correct2 / $total2, - $total3 == 0 ? -1 : $correct3 / $total3, $total4 == 0 ? -1 : $correct4 / $total4); + my $brevity = ($length_translation > $closestTruthLength || $length_translation == 0) ? 1 : exp(1 - $closestTruthLength / $length_translation); + my @pct; my ($logsum, $logcount) = (0, 0); - if($total1 > 0) {$logsum += my_log($pct1); $logcount++;} - if($total2 > 0) {$logsum += my_log($pct2); $logcount++;} - if($total3 > 0) {$logsum += my_log($pct3); $logcount++;} - if($total4 > 0) {$logsum += my_log($pct4); $logcount++;} + for(my $i = 0; $i < $maxNgramOrder; $i++) + { + $total[$i] = max(1, $length_translation - $i); + push @pct, ($total[$i] == 0) ? -1 : $correct[$i] / $total[$i]; + if($total[$i] > 0) + { + $logsum += my_log($pct[$i]); + $logcount++; + } + } my $bleu = $brevity * exp($logsum / $logcount); - $returnData->[0] = [$bleu, $pct1, $pct2, $pct3, $pct4]; + $returnData->[0] = [$bleu, @pct]; $returnData->[1] = $ngramMatches; return $returnData; } -#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentence (same), factor index to use +#can handle multiple sentence translations; assume at least one +#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use #return: hashref of sysout word index => whether word matches sub getPWERSentenceDetails { - my ($refSysOutput, $refTruth, $factorIndex) = @_; - my $indices = {}; - my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth)); - my @truthWordUsed = (0) x $eLength; #array of 0/1; can only match a given truth word once - for(my $j = 0; $j < $sLength; $j++) + my ($refSysOutput, $refTruths, $factorIndex) = @_; + my $matches = {}; + my %truthWords; #hash from word to arrayref with number of times seen in each reference (but later holds only the max) + for(my $i = 0; $i < scalar(@$refTruths); $i++) { - $indices->{$j} = 0; - for(my $k = 0; $k < $eLength; $k++) #check output word against entire truth sentence + foreach my $eWord (@{$refTruths->[$i]}) { - if(lc $refSysOutput->[$j]->[$factorIndex] eq lc $refTruth->[$k]->[$factorIndex] && $truthWordUsed[$k] == 0) - { - $truthWordUsed[$k] = 1; - $indices->{$j} = 1; - last; - } + my $factor = $eWord->[$factorIndex]; + if(exists $truthWords{$factor}) {$truthWords{$factor}->[$i]++;} + else {my @tmp = (0) x scalar(@$refTruths); $tmp[$i] = 1; $truthWords{$factor} = \@tmp;} } } - return $indices; + %truthWords = map {$_ => maxN(@{$truthWords{$_}})} (keys %truthWords); #save only the max times each word is seen in a reference + for(my $j = 0; $j < scalar(@$refSysOutput); $j++) + { + if(exists $truthWords{$refSysOutput->[$j]->[$factorIndex]} && $truthWords{$refSysOutput->[$j]->[$factorIndex]} > 0) + { + $truthWords{$refSysOutput->[$j]->[$factorIndex]}--; + $matches->{$j} = 1; + } + else + { + $matches->{$j} = 0; + } + } + return $matches; } #assign ranks to sentences by BLEU score @@ -287,11 +375,12 @@ sub getSentenceBGColorHTML #display all matching n-grams in the given sentence, with all 1-grams on one line, then arranged by picking, for each, the first line on which it fits, # where a given word position can only be filled by one n-gram per line, so that all n-grams can be shown -#arguments: sentence (arrayref of arrayrefs of factor strings), arrayref of arrayrefs of matching n-gram [start, length] +#arguments: sentence (arrayref of arrayrefs of factor strings), arrayref of arrayrefs of matching n-gram [start, length, arrayref of matching reference indices], +# number of reference translations #return: HTML string sub getAllNgramsHTML { - my ($sentence, $ngrams) = @_; + my ($sentence, $ngrams, $numTruths) = @_; my $factorIndex = 0; my @table = (); #array or arrayrefs each of which represents a line; each position has the index of the occupying n-gram, or -1 if none my $n = 0; #n-gram index @@ -325,14 +414,12 @@ sub getAllNgramsHTML my $html = ""; my $numWords = scalar(@$sentence); - my ($curRow, $curCol) = (0, 0); - my $colorIndex = 0; + my ($curRow, $curCol) = (0, 0); #address in table $html .= ""; - foreach my $ngram (sort {my $c = $a->[2] <=> $b->[2]; if($c == 0) {$a->[0] <=> $b->[0]} else {$c}} @$ngrams) #sort by row, then word num + foreach my $ngram (sort {my $c = $a->[3] <=> $b->[3]; if($c == 0) {$a->[0] <=> $b->[0]} else {$c}} @$ngrams) #sort by row, then word num { - while($ngram->[0] > $curCol || $ngram->[2] > $curRow) {$html .= ""; $curCol = ($curCol + 1) % $numWords; if($curCol == 0) {$html .= ""; $curRow++;}} - $html .= ""; - $colorIndex = ($colorIndex + 1) % scalar(@ngramColors); + while($ngram->[0] > $curCol || $ngram->[3] > $curRow) {$html .= ""; $curCol = ($curCol + 1) % $numWords; if($curCol == 0) {$html .= ""; $curRow++;}} + $html .= ""; $curCol = ($curCol + $ngram->[1]) % $numWords; if($curCol == 0) {$html .= ""; $curRow++;} } $html .= ""; @@ -349,15 +436,12 @@ sub rowIsClear return (maxN(@{$row}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) == -1) ? 1 : 0; } -#arguments: array of numeric values -#return: max value, or empty list if input list is empty -sub maxN +#auxiliary to getAllNgramsHTML() +#arguments: number of reference translations matching the n-gram, total number of references +#return: HTML color string +sub getNgramColorHTML { - if(scalar(@_) == 0) {return ();} - my $max = $_[0]; - for(my $i = 1; $i < scalar(@_); $i++) - { - if($_[$i] > $max) {$max = $_[$i];} - } - return $max; + my ($matches, $total) = @_; + if($total == 1) {return $ngramSingleRefColor;} + return $ngramMultirefColors[round($matches / $total * (scalar(@ngramMultirefColors) - 1))]; } diff --git a/scripts/analysis/smtgui/README b/scripts/analysis/smtgui/README index c86cd9c1a..e6bcabb2e 100644 --- a/scripts/analysis/smtgui/README +++ b/scripts/analysis/smtgui/README @@ -29,3 +29,14 @@ $ $DATA/test/combine-features.perl CORPUS lc+pos lemma > CORPUS.lc+pos+lemma $ rm CORPUS.pos-tmp (cleanup) where $BIN=/export/ws06osmt/bin, $DATA=/export/ws06osmt/data. + +To get German POS tags and lemmas from a words-only corpus (the first step must be run on linux): + +$ $BIN/recase.perl --in CORPUS.lc --model $MODELS/en-de/recaser/pharaoh.ini > CORPUS.recased (call pharaoh with a lowercase->uppercase model) +$ $BIN/run-lopar-tagger-lowercase.perl CORPUS.recased CORPUS.recased.lopar (call LOPAR) +$ $DATA/test/factor-stem.de.perl < CORPUS.recased.lopar > CORPUS.stem +$ $BIN/lowercase.latin1.perl < CORPUS.stem > CORPUS.lcstem (as you might guess, assumes latin-1 encoding) +$ $DATA/test/factor-pos.de.perl < CORPUS.recased.lopar > CORPUS.pos +$ $DATA/test/combine-features.perl CORPUS lc pos lcstem > CORPUS.lc+pos+lcstem + +where $MODELS=/export/ws06osmt/models. diff --git a/scripts/analysis/smtgui/file-descriptions b/scripts/analysis/smtgui/file-descriptions index caf1507e6..5c1f9153f 100644 --- a/scripts/analysis/smtgui/file-descriptions +++ b/scripts/analysis/smtgui/file-descriptions @@ -1,3 +1,4 @@ devtest2006.de-en.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run devtest2006.de-en.matrix05-baseline.moses-2006-07-20 Moses baseline run devtest2006.en-de.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run +devtest2006.en-de.matrix05-moses.2006-08-02 Moses baseline run diff --git a/scripts/analysis/smtgui/file-factors b/scripts/analysis/smtgui/file-factors index fad031ec4..7938e9297 100644 --- a/scripts/analysis/smtgui/file-factors +++ b/scripts/analysis/smtgui/file-factors @@ -4,3 +4,5 @@ devtest2006.de-en : surf pos lemma : surf europarl.de.srilm.gz : surf europarl.e devtest2006.en-de : surf pos lemma : surf europarl.en.srilm.gz : surf europarl.de.srilm.gz #pstem: lemmas come from the Porter stemmer (and so are really a mix of stems and lemmas) pstem_devtest2006.de-en : surf pos lemma : : surf europarl.en.srilm.gz +#replace esset with ss in German text +ss_devtest2006.en-de : surf pos lemma : surf europarl.en.srilm.gz : surf ss_europarl.de.srilm.gz -- cgit v1.2.3
" . join("", map {$_->[$factorIndex]} @$sentence) . "
" . join(' ', map {$_->[$factorIndex]} @{$sentence}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) . "
[2]}), $numTruths) . "\">" . join(' ', map {$_->[$factorIndex]} @{$sentence}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) . "