Add --possiblyUseFirstToken option which, when given, allows certain sentence-initial tokens to be counted as casing evidence instead of being ignored. See the comment in the script header or the support mailing list discussion.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3690 1f5c12ca-751b-0410-a591-d2e778427230
author: bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230> 2010-11-09 14:05:23 +0300
committer: bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230> 2010-11-09 14:05:23 +0300
commit: 518035ed0599ba15d4a3f444b06c8b01116951b4 (patch)
tree: a2c777cd4eb71647b375c779e2a2ac483da0c965 /scripts/recaser/train-truecaser.perl
parent: 9a72825d29869e94d505e9a86607190928b8de64 (diff)
1 files changed, 36 insertions, 5 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 2250b9172..25d79e551 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -1,14 +1,23 @@
 #!/usr/bin/perl -w
 
 # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+
+#
+# Options:
+#
+# --possiblyUseFirstToken : boolean flag. By default (flag not given) the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized and so says nothing about its natural casing. If the flag is given, then: a) a sentence-initial token whose first character is *not* upper case is counted with full weight, and b) a capitalized sentence-initial token that is the only token of its segment is counted, but with only 10% of the weight of a normal token. Any other capitalized sentence-initial token is still ignored.
+#
+
 use strict;
 use Getopt::Long "GetOptions";
 
 # apply switches
 my ($MODEL,$CORPUS);
-die("train-truecaser.perl --model truecaser --corpus cased")
+die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]")
     unless &GetOptions('corpus=s' => \$CORPUS,
-                       'model=s' => \$MODEL) && defined($CORPUS) && defined($MODEL);
+                       'model=s' => \$MODEL,
+                       'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0))
+    && defined($CORPUS) && defined($MODEL);
 my %CASING;
 my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
 my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
@@ -19,10 +28,32 @@ while(<CORPUS>) {
   my @WORD = split;
   my $start = 0;
   while($start<=$#WORD && defined($DELAYED_SENTENCE_START{$WORD[$start]})) { $start++; }
-  for(my $i=$start+1;$i<=$#WORD;$i++) {
-    if (! defined($SENTENCE_END{$WORD[$i-1]})) {
-      $CASING{ lc($WORD[$i]) }{ $WORD[$i] }++;
+  my $firstWordOfSentence = 1;
+  for(my $i=$start;$i<=$#WORD;$i++) {
+    my $currentWord = $WORD[$i];
+    if (! $firstWordOfSentence && defined($SENTENCE_END{$WORD[$i-1]})) {
+      $firstWordOfSentence = 1;
+    }
+
+    my $currentWordWeight = 0;
+    if (! $firstWordOfSentence) {
+      $currentWordWeight = 1;
+    } elsif ($possiblyUseFirstToken) {
+      # gated special handling of first word of sentence
+      my $firstChar = substr($currentWord, 0, 1);
+      if (lc($firstChar) eq $firstChar) {
+        # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial)
+	$currentWordWeight = 1;
+      } elsif (scalar(@WORD) == 1) {
+	# if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case)
+	$currentWordWeight = 0.1;
+      }
+    }
+    if ($currentWordWeight > 0) {
+      $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight;
     }
+
+    $firstWordOfSentence = 0;
   }
 }
 close(CORPUS);
author	bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230>	2010-11-09 14:05:23 +0300
committer	bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230>	2010-11-09 14:05:23 +0300
commit	518035ed0599ba15d4a3f444b06c8b01116951b4 (patch)
tree	a2c777cd4eb71647b375c779e2a2ac483da0c965 /scripts/recaser/train-truecaser.perl
parent	9a72825d29869e94d505e9a86607190928b8de64 (diff)