Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2007-03-26 09:44:27 +0400
committer: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2007-03-26 09:44:27 +0400
commit: 3d288d81e4cd4da62b976b2aaaabb63576d582a2 (patch)
tree: b1a9eabfcffec9f79e48c15590f812386ab26dfa /scripts/recaser
parent: b8a0761af387511c229368f50740a40f94eb8fb1 (diff)
Proper unicode-based lower and uppercasing.
Added language option to recase.perl, English remains the default.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1326 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/recaser')
-rwxr-xr-x  scripts/recaser/recase.perl         29
-rwxr-xr-x  scripts/recaser/train-recaser.perl   3
2 files changed, 24 insertions, 8 deletions
diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl
index a6fc82133..7a173c7b5 100755
--- a/scripts/recaser/recase.perl
+++ b/scripts/recaser/recase.perl
@@ -6,17 +6,25 @@ use Getopt::Long "GetOptions";
my ($SRC,$INFILE,$RECASE_MODEL);
my $MOSES = "moses";
+my $LANGUAGE = "en"; # English by default;
die("recase.perl --in file --model ini-file > out")
unless &GetOptions('in=s' => \$INFILE,
'headline=s' => \$SRC,
+ 'lang=s' => \$LANGUAGE,
'moses=s' => \$MOSES,
'model=s' => \$RECASE_MODEL)
&& defined($INFILE)
&& defined($RECASE_MODEL);
+my %treated_languages = map { ($_,1) } qw/en cs/;
+die "I don't know any rules for $LANGUAGE. Use 'en' as the default."
+ if ! defined $treated_languages{$LANGUAGE};
+
# lowercase even in headline
my %ALWAYS_LOWER;
-foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; }
+if ($LANGUAGE eq "en" ) {
+ foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; }
+}
# find out about the headlines
my @HEADLINE;
@@ -32,10 +40,13 @@ if (defined($SRC)) {
close(SRC);
}
+binmode(STDOUT, ":utf8");
+
my $sentence = 0;
my $infile = $INFILE;
$infile =~ s/[\.\/]/_/g;
open(MODEL,"$MOSES -f $RECASE_MODEL -i $INFILE -dl 1|");
+binmode(MODEL, ":utf8");
while(<MODEL>) {
chomp;
s/\s+$//;
@@ -44,11 +55,14 @@ while(<MODEL>) {
# uppercase initial word
&uppercase(\$WORD[0]);
- # uppercase after period
- for(my $i=1;$i<scalar(@WORD);$i++) {
- if ($WORD[$i-1] eq '.') {
- &uppercase(\$WORD[$i]);
- }
+ if ($LANGUAGE ne "cs") {
+ # uppercase after period
+ # unless in Czech where '.' is used after all ordinals
+ for(my $i=1;$i<scalar(@WORD);$i++) {
+ if ($WORD[$i-1] eq '.') {
+ &uppercase(\$WORD[$i]);
+ }
+ }
}
# uppercase headlines {
@@ -74,6 +88,5 @@ close(MODEL);
sub uppercase {
my ($W) = @_;
- substr($$W,0,1) =~ tr/a-z/A-Z/;
- substr($$W,0,1) =~ tr/à-þ/À-Þ/;
+ $$W = ucfirst($$W); # rely on Perl's Unicode knowledge, never use tr//
}
diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl
index 10e2c008d..cdf31a7b1 100755
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -54,9 +54,12 @@ sub train_lm {
sub prepare_data {
print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;
open(CORPUS,$CORPUS);
+ binmode(CORPUS, ":utf8");
open(CASED,">$DIR/aligned.cased");
+ binmode(CASED, ":utf8");
print "$DIR/aligned.lowercased\n";
open(LOWERCASED,">$DIR/aligned.lowercased");
+ binmode(LOWERCASED, ":utf8");
open(ALIGNMENT,">$DIR/aligned.a");
while(<CORPUS>) {
next if length($_)>2000;