diff options
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x | scripts/analysis/sentence-by-sentence.pl | 14 |
1 file changed, 14 insertions, 0 deletions
diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl
index e4d824fad..d5ca9879c 100755
--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@@ -47,6 +47,7 @@ my @ngramMultirefColors = ('#ff9999', '#ff9933', '#ffff99', '#a0a0ff', '#99ff99'
 my $i = 0;
 while(my $sLine = <SYSOUT>)
 {
+	escapeMetachars($sLine); #remove inconsistencies in encoding
 	my @sFactors = @{extractFactorArrays($sLine)};
 	my @eLines = () x scalar(@truthfiles);
 	my @eFactors;
@@ -54,12 +55,14 @@ while(my $sLine = <SYSOUT>)
 	{
 		my $fh = $TRUTHS[$j];
 		$eLines[$j] = <$fh>;
+		escapeMetachars($eLines[$j]); #remove inconsistencies in encoding
 		push @eFactors, extractFactorArrays($eLines[$j], "$truthfiles[$j] shorter than $sysoutfile");
 	}
 	my $sourceFactors;
 	if (defined $sourcefile)
 	{
 		my $sourceLine = <SOURCE>;
+		escapeMetachars($sourceLine); #remove inconsistencies in encoding
 		$sourceFactors = extractFactorArrays($sourceLine, "$sourcefile shorter than $sysoutfile");
 	}
 
@@ -207,6 +210,17 @@ sub round
 	return ($x - int($x) < .5) ? int($x) : int($x) + 1;
 }
 
+#escape HTML metacharacters for display purposes and to allow for consistent string comparison
+#arguments: string to be formatted
+#return: none
+sub escapeMetachars
+{
+	my $str = shift;
+	$str =~ s/&\s+/& /;
+	$str =~ s/<\s+/< /;
+	$str =~ s/>\s+/> /;
+}
+
 ###############################################################################################################################################################
 
 #arguments: line read from corpus file, (optionally) string to die with if line isn't defined (default die-msg is empty)