diff options
author | Ondrej Bojar <bojar@ufal.mff.cuni.cz> | 2017-04-07 18:28:13 +0300 |
---|---|---|
committer | Ondrej Bojar <bojar@ufal.mff.cuni.cz> | 2017-04-07 18:28:13 +0300 |
commit | d9faf8f901a477d96af13546d15d9686f2d350ac (patch) | |
tree | e73d928a2f5cc8ccb2ce62fc083220daee8875e2 | |
parent | 1d650061900bd1d06aa1aa8e5bd706c24682982e (diff) |
ignore words where there is nothing to case
-rwxr-xr-x | scripts/recaser/train-truecaser.perl | 6 |
1 file changed, 6 insertions, 0 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 4f600a640..94ddbf2fa 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -44,6 +44,12 @@ while(<CORPUS>) {
       $firstWordOfSentence = 1;
     }
 
+    if ($currentWord !~ /[\p{Ll}\p{Lu}\p{Lt}]/) {
+      # skip words with nothing to case
+      $firstWordOfSentence = 0;
+      next;
+    }
+
     my $currentWordWeight = 0;
     if (! $firstWordOfSentence) {
       $currentWordWeight = 1;