Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-16 00:17:41 +0400
committer eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-16 00:17:41 +0400
commit 486f88157fa462c5924a82a320e3d369b9a36380 (patch)
tree d5bedb6b15a0ec742370711970571db9d4d86295 /scripts/analysis
parent 2a3037692b18438e3901948d7a516048aadae5ec (diff)
add formatting for sentence strings to make token comparison more accurate
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@761 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x scripts/analysis/sentence-by-sentence.pl 14
1 files changed, 14 insertions, 0 deletions
diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl
index e4d824fad..d5ca9879c 100755
--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@@ -47,6 +47,7 @@ my @ngramMultirefColors = ('#ff9999', '#ff9933', '#ffff99', '#a0a0ff', '#99ff99'
my $i = 0;
while(my $sLine = <SYSOUT>)
{
+ escapeMetachars($sLine); #remove inconsistencies in encoding
my @sFactors = @{extractFactorArrays($sLine)};
my @eLines = () x scalar(@truthfiles);
my @eFactors;
@@ -54,12 +55,14 @@ while(my $sLine = <SYSOUT>)
{
my $fh = $TRUTHS[$j];
$eLines[$j] = <$fh>;
+ escapeMetachars($eLines[$j]); #remove inconsistencies in encoding
push @eFactors, extractFactorArrays($eLines[$j], "$truthfiles[$j] shorter than $sysoutfile");
}
my $sourceFactors;
if (defined $sourcefile)
{
my $sourceLine = <SOURCE>;
+ escapeMetachars($sourceLine); #remove inconsistencies in encoding
$sourceFactors = extractFactorArrays($sourceLine, "$sourcefile shorter than $sysoutfile");
}
@@ -207,6 +210,17 @@ sub round
return ($x - int($x) < .5) ? int($x) : int($x) + 1;
}
+#escape HTML metacharacters (& < >) that are followed by whitespace, for display and for consistent string comparison
+#arguments: string to be formatted; it is edited IN PLACE through the @_ alias, so pass a variable, not a literal
+#return: none (the caller's variable is updated; an undefined argument is left untouched)
+sub escapeMetachars
+{
+    return unless defined $_[0]; #a filehandle at EOF yields undef; leave it undef so the caller can detect the short file
+    $_[0] =~ s/&\s+/&amp; /g; #/g: escape every occurrence on the line, not only the first
+    $_[0] =~ s/<\s+/&lt; /g;
+    $_[0] =~ s/>\s+/&gt; /g;
+}
+
###############################################################################################################################################################
#arguments: line read from corpus file, (optionally) string to die with if line isn't defined (default die-msg is empty)