diff options
author | eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-08-16 00:17:41 +0400 |
---|---|---|
committer | eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-08-16 00:17:41 +0400 |
commit | 486f88157fa462c5924a82a320e3d369b9a36380 (patch) | |
tree | d5bedb6b15a0ec742370711970571db9d4d86295 /scripts/analysis | |
parent | 2a3037692b18438e3901948d7a516048aadae5ec (diff) |
add formatting for sentence strings to make token comparison more accurate
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@761 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x | scripts/analysis/sentence-by-sentence.pl | 14 |
1 files changed, 14 insertions, 0 deletions
diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl
index e4d824fad..d5ca9879c 100755
--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@@ -47,6 +47,7 @@ my @ngramMultirefColors = ('#ff9999', '#ff9933', '#ffff99', '#a0a0ff', '#99ff99'
 my $i = 0;
 while(my $sLine = <SYSOUT>)
 {
+	escapeMetachars($sLine); #remove inconsistencies in encoding
 	my @sFactors = @{extractFactorArrays($sLine)};
 	my @eLines = () x scalar(@truthfiles);
 	my @eFactors;
@@ -54,12 +55,14 @@ while(my $sLine = <SYSOUT>)
 	{
 		my $fh = $TRUTHS[$j];
 		$eLines[$j] = <$fh>;
+		escapeMetachars($eLines[$j]); #remove inconsistencies in encoding
 		push @eFactors, extractFactorArrays($eLines[$j], "$truthfiles[$j] shorter than $sysoutfile");
 	}
 	my $sourceFactors;
 	if (defined $sourcefile)
 	{
 		my $sourceLine = <SOURCE>;
+		escapeMetachars($sourceLine); #remove inconsistencies in encoding
 		$sourceFactors = extractFactorArrays($sourceLine, "$sourcefile shorter than $sysoutfile");
 	}
 
@@ -207,6 +210,21 @@ sub round
 	return ($x - int($x) < .5) ? int($x) : int($x) + 1;
 }
 
+#escape HTML metacharacters for display purposes and to allow for consistent string comparison
+#only a metacharacter followed by whitespace is rewritten, and that whitespace run (a trailing newline included)
+#collapses to one space; text that is already escaped (e.g. "&amp; ") does not match, so a second call changes nothing
+#arguments: string to be formatted; it is modified in place, because every caller invokes this sub in void context
+#return: the escaped string (nothing if the argument was undefined)
+sub escapeMetachars
+{
+	#operate on $_[0], which aliases the caller's variable; edits made to a private copy would be lost on return
+	return unless defined $_[0]; #a line read past end of file stays undef so the caller's own check can report it
+	$_[0] =~ s/&\s+/&amp; /g;
+	$_[0] =~ s/<\s+/&lt; /g;
+	$_[0] =~ s/>\s+/&gt; /g;
+	return $_[0];
+}
+
 ###############################################################################################################################################################
 #arguments: line read from corpus file, (optionally) string to die with if line isn't defined (default die-msg is empty)