Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author alvations <alvations@gmail.com> 2018-12-20 06:53:02 +0300
committer alvations <alvations@gmail.com> 2018-12-20 06:53:02 +0300
commit 40748e528d8238321f4c2864a9d3ed5fa90b59fa (patch)
tree 6b00009ce5ef0a9b3dd7e6ec2c6449d46a21ad96
parent 413ba6b583465a7e7727166341fe5eaef89c6bf5 (diff)
split_xml should be consistent for training and using
-rwxr-xr-x scripts/recaser/train-truecaser.perl 16
1 files changed, 14 insertions, 2 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 94ddbf2fa..589ee43e3 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -103,8 +103,20 @@ sub split_xml {
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
- $MARKUP[$i] .= $1." ";
- $line = $2;
+ my $potential_xml = $1;
+ my $line_next = $2;
+ # exception for factor that is an XML tag
+ if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
+ $WORD[$i-1] .= $potential_xml;
+ if ($line_next =~ /^(\|+)(.*)$/) {
+ $WORD[$i-1] .= $1;
+ $line_next = $2;
+ }
+ }
+ else {
+ $MARKUP[$i] .= $potential_xml." ";
+ }
+ $line = $line_next;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {