From d9faf8f901a477d96af13546d15d9686f2d350ac Mon Sep 17 00:00:00 2001
From: Ondrej Bojar
Date: Fri, 7 Apr 2017 17:28:13 +0200
Subject: ignore words where there is nothing to case

---
 scripts/recaser/train-truecaser.perl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 4f600a640..94ddbf2fa 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -44,6 +44,12 @@ while() {
       $firstWordOfSentence = 1;
     }
 
+    if ($currentWord !~ /[\p{Ll}\p{Lu}\p{Lt}]/) {
+      # skip words with nothing to case
+      $firstWordOfSentence = 0;
+      next;
+    }
+
     my $currentWordWeight = 0;
     if (! $firstWordOfSentence) {
       $currentWordWeight = 1;
-- 
cgit v1.2.3