Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ondrej Bojar <bojar@ufal.mff.cuni.cz> 2017-04-07 18:28:13 +0300
committer: Ondrej Bojar <bojar@ufal.mff.cuni.cz> 2017-04-07 18:28:13 +0300
commit: d9faf8f901a477d96af13546d15d9686f2d350ac (patch)
tree: e73d928a2f5cc8ccb2ce62fc083220daee8875e2
parent: 1d650061900bd1d06aa1aa8e5bd706c24682982e (diff)
ignore words where there is nothing to case
-rwxr-xr-x scripts/recaser/train-truecaser.perl 6
1 files changed, 6 insertions, 0 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 4f600a640..94ddbf2fa 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -44,6 +44,12 @@ while(<CORPUS>) {
$firstWordOfSentence = 1;
}
+ if ($currentWord !~ /[\p{Ll}\p{Lu}\p{Lt}]/) {
+ # skip words with nothing to case
+ $firstWordOfSentence = 0;
+ next;
+ }
+
my $currentWordWeight = 0;
if (! $firstWordOfSentence) {
$currentWordWeight = 1;