diff options
author | alvations <alvations@gmail.com> | 2019-01-03 15:51:27 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-01-03 15:51:27 +0300 |
commit | 8fdbc74bbf9253fd0e442231d18762e5c67213b4 (patch) | |
tree | fc8ce75ec13a4e952e022f40c649e07958570217 | |
parent | dfbb17e549d4cb4ece452c7224ae47a590b7a4da (diff) |
Reverting split_xml()
-rwxr-xr-x | scripts/recaser/train-truecaser.perl | 16 |
1 file changed, 2 insertions, 14 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 589ee43e3..94ddbf2fa 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -103,20 +103,8 @@ sub split_xml {
   while($line =~ /\S/) {
     # XML tag
     if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
-      my $potential_xml = $1;
-      my $line_next = $2;
-      # exception for factor that is an XML tag
-      if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
-        $WORD[$i-1] .= $potential_xml;
-        if ($line_next =~ /^(\|+)(.*)$/) {
-          $WORD[$i-1] .= $1;
-          $line_next = $2;
-        }
-      }
-      else {
-        $MARKUP[$i] .= $potential_xml." ";
-      }
-      $line = $line_next;
+      $MARKUP[$i] .= $1." ";
+      $line = $2;
     }
     # non-XML text
     elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {