Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x scripts/recaser/train-truecaser.perl 16
1 files changed, 14 insertions, 2 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 94ddbf2fa..589ee43e3 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -103,8 +103,20 @@ sub split_xml {
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
- $MARKUP[$i] .= $1." ";
- $line = $2;
+ my $potential_xml = $1;
+ my $line_next = $2;
+ # exception for factor that is an XML tag
+ if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
+ $WORD[$i-1] .= $potential_xml;
+ if ($line_next =~ /^(\|+)(.*)$/) {
+ $WORD[$i-1] .= $1;
+ $line_next = $2;
+ }
+ }
+ else {
+ $MARKUP[$i] .= $potential_xml." ";
+ }
+ $line = $line_next;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {