Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Rico Sennrich <rico.sennrich@gmx.ch> 2014-02-08 19:43:00 +0400
committer: Rico Sennrich <rico.sennrich@gmx.ch> 2014-02-08 19:43:00 +0400
commit: ee06a0f6522d7cdc6bd834032c221e66ab70d841 (patch)
tree: 63d0c754d7ad9f475471cf3cf39e4e47645d327d /scripts/recaser/train-truecaser.perl
parent: 138947d9ab8088ce54aa51858c194887e2c987e3 (diff)
don't complain if input contains non-escaped '<' or '>', but is not XML
Diffstat (limited to 'scripts/recaser/train-truecaser.perl')
-rwxr-xr-x scripts/recaser/train-truecaser.perl 8
1 files changed, 8 insertions, 0 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 8a1ba4c76..59a83ec91 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -86,15 +86,23 @@ sub split_xml {
my $i = 0;
$MARKUP[0] = "";
while($line =~ /\S/) {
+ # XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
$MARKUP[$i] .= $1." ";
$line = $2;
}
+ # non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
+ # '<' or '>' occurs in word, but it's not an XML tag
+ elsif ($line =~ /^\s*(\S+)(.*)$/) {
+ $WORD[$i++] = $1;
+ $MARKUP[$i] = "";
+ $line = $2;
+ }
else {
die("ERROR: huh? $line\n");
}