diff options
author | alvations <alvations@gmail.com> | 2018-12-20 06:53:02 +0300 |
---|---|---|
committer | alvations <alvations@gmail.com> | 2018-12-20 06:53:02 +0300 |
commit | 40748e528d8238321f4c2864a9d3ed5fa90b59fa (patch) | |
tree | 6b00009ce5ef0a9b3dd7e6ec2c6449d46a21ad96 | |
parent | 413ba6b583465a7e7727166341fe5eaef89c6bf5 (diff) |
split_xml should be consistent for training and using
-rwxr-xr-x | scripts/recaser/train-truecaser.perl | 16 |
1 files changed, 14 insertions, 2 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 94ddbf2fa..589ee43e3 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -103,8 +103,20 @@ sub split_xml {
   while($line =~ /\S/) {
     # XML tag
     if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
-      $MARKUP[$i] .= $1." ";
-      $line = $2;
+      my $potential_xml = $1;
+      my $line_next = $2;
+      # exception for factor that is an XML tag
+      if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
+        $WORD[$i-1] .= $potential_xml;
+        if ($line_next =~ /^(\|+)(.*)$/) {
+          $WORD[$i-1] .= $1;
+          $line_next = $2;
+        }
+      }
+      else {
+        $MARKUP[$i] .= $potential_xml." ";
+      }
+      $line = $line_next;
     }
     # non-XML text
     elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {