Proper unicode-based lower and uppercasing.

Added language option to recase.perl; English remains the default.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1326 1f5c12ca-751b-0410-a591-d2e778427230
author: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2007-03-26 09:44:27 +0400
committer: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2007-03-26 09:44:27 +0400
commit: 3d288d81e4cd4da62b976b2aaaabb63576d582a2 (patch)
tree: b1a9eabfcffec9f79e48c15590f812386ab26dfa /scripts/recaser
parent: b8a0761af387511c229368f50740a40f94eb8fb1 (diff)
2 files changed, 24 insertions, 8 deletions
diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl
index a6fc82133..7a173c7b5 100755
--- a/scripts/recaser/recase.perl
+++ b/scripts/recaser/recase.perl
@@ -6,17 +6,25 @@ use Getopt::Long "GetOptions";
 
 my ($SRC,$INFILE,$RECASE_MODEL);
 my $MOSES = "moses";
+my $LANGUAGE = "en"; # English by default;
 die("recase.perl --in file --model ini-file > out")
     unless &GetOptions('in=s' => \$INFILE,
                        'headline=s' => \$SRC,
+                       'lang=s' => \$LANGUAGE,
 		       'moses=s' => \$MOSES,
                        'model=s' => \$RECASE_MODEL)
     && defined($INFILE)
     && defined($RECASE_MODEL);
 
+my %treated_languages = map { ($_,1) } qw/en cs/;
+die "I don't know any rules for $LANGUAGE. Use 'en' as the default."
+  if ! defined $treated_languages{$LANGUAGE};
+
 # lowercase even in headline
 my %ALWAYS_LOWER;
-foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; }
+if ($LANGUAGE eq "en" ) {
+  foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; }
+}
 
 # find out about the headlines
 my @HEADLINE;
@@ -32,10 +40,13 @@ if (defined($SRC)) {
     close(SRC);
 }
 
+binmode(STDOUT, ":utf8");
+
 my $sentence = 0;
 my $infile = $INFILE;
 $infile =~ s/[\.\/]/_/g;
 open(MODEL,"$MOSES -f $RECASE_MODEL -i $INFILE -dl 1|");
+binmode(MODEL, ":utf8");
 while(<MODEL>) {
     chomp;
     s/\s+$//;
@@ -44,11 +55,14 @@ while(<MODEL>) {
     # uppercase initial word
     &uppercase(\$WORD[0]);
 
-    # uppercase after period
-    for(my $i=1;$i<scalar(@WORD);$i++) {
-	if ($WORD[$i-1] eq '.') {
-	    &uppercase(\$WORD[$i]);
-	}
+    if ($LANGUAGE ne "cs") {
+      # uppercase after period
+      # unless in Czech where '.' is used after all ordinals
+      for(my $i=1;$i<scalar(@WORD);$i++) {
+	  if ($WORD[$i-1] eq '.') {
+	      &uppercase(\$WORD[$i]);
+	  }
+      }
     }
 
     # uppercase headlines {
@@ -74,6 +88,5 @@ close(MODEL);
 
 sub uppercase {
     my ($W) = @_;
-    substr($$W,0,1) =~ tr/a-z/A-Z/;
-    substr($$W,0,1) =~ tr/à-þ/À-Þ/;
+    $$W = ucfirst($$W); # rely on Perl's Unicode knowledge, never use tr//
 }
diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl
index 10e2c008d..cdf31a7b1 100755
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -54,9 +54,12 @@ sub train_lm {
 sub prepare_data {
     print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;
     open(CORPUS,$CORPUS);
+    binmode(CORPUS, ":utf8");
     open(CASED,">$DIR/aligned.cased");
+    binmode(CASED, ":utf8");
     print "$DIR/aligned.lowercased\n";
     open(LOWERCASED,">$DIR/aligned.lowercased");
+    binmode(LOWERCASED, ":utf8");
     open(ALIGNMENT,">$DIR/aligned.a");
     while(<CORPUS>) {
 	next if length($_)>2000;
author	bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>	2007-03-26 09:44:27 +0400
committer	bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>	2007-03-26 09:44:27 +0400
commit	3d288d81e4cd4da62b976b2aaaabb63576d582a2 (patch)
tree	b1a9eabfcffec9f79e48c15590f812386ab26dfa /scripts/recaser
parent	b8a0761af387511c229368f50740a40f94eb8fb1 (diff)