Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230> 2010-11-09 14:05:23 +0300
committer: bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230> 2010-11-09 14:05:23 +0300
commit: 518035ed0599ba15d4a3f444b06c8b01116951b4 (patch)
tree: a2c777cd4eb71647b375c779e2a2ac483da0c965 /scripts/recaser/train-truecaser.perl
parent: 9a72825d29869e94d505e9a86607190928b8de64 (diff)
add --possiblyUseFirstToken option, which, when selected, allows certain sentence-initial tokens to be taken into account. See comment in header or support mailing list discussion.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3690 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/recaser/train-truecaser.perl')
-rwxr-xr-x scripts/recaser/train-truecaser.perl 41
1 files changed, 36 insertions, 5 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 2250b9172..25d79e551 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -1,14 +1,23 @@
#!/usr/bin/perl -w
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+
+#
+# Options:
+#
+# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token.
+#
+
use strict;
use Getopt::Long "GetOptions";
# apply switches
my ($MODEL,$CORPUS);
-die("train-truecaser.perl --model truecaser --corpus cased")
+die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]")
unless &GetOptions('corpus=s' => \$CORPUS,
- 'model=s' => \$MODEL) && defined($CORPUS) && defined($MODEL);
+ 'model=s' => \$MODEL,
+ 'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0))
+ && defined($CORPUS) && defined($MODEL);
my %CASING;
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
@@ -19,10 +28,32 @@ while(<CORPUS>) {
my @WORD = split;
my $start = 0;
while($start<=$#WORD && defined($DELAYED_SENTENCE_START{$WORD[$start]})) { $start++; }
- for(my $i=$start+1;$i<=$#WORD;$i++) {
- if (! defined($SENTENCE_END{$WORD[$i-1]})) {
- $CASING{ lc($WORD[$i]) }{ $WORD[$i] }++;
+ my $firstWordOfSentence = 1;
+ for(my $i=$start;$i<=$#WORD;$i++) {
+ my $currentWord = $WORD[$i];
+ if (! $firstWordOfSentence && defined($SENTENCE_END{$WORD[$i-1]})) {
+ $firstWordOfSentence = 1;
+ }
+
+ my $currentWordWeight = 0;
+ if (! $firstWordOfSentence) {
+ $currentWordWeight = 1;
+ } elsif ($possiblyUseFirstToken) {
+ # gated special handling of first word of sentence
+ my $firstChar = substr($currentWord, 0, 1);
+ if (lc($firstChar) eq $firstChar) {
+ # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial)
+ $currentWordWeight = 1;
+ } elsif (scalar(@WORD) == 1) {
+ # if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case)
+ $currentWordWeight = 0.1;
+ }
+ }
+ if ($currentWordWeight > 0) {
+ $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight;
}
+
+ $firstWordOfSentence = 0;
}
}
close(CORPUS);