Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>, 2009-02-09 18:32:34 +0300
committer: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>, 2009-02-09 18:32:34 +0300
commit: a62f8ee31630caf230100ff5e34e773d94afcf43 (patch)
tree: de024ff1a6d32b0af3e5dc2cd86f441d91dcb5a3 /scripts/recaser/train-truecaser.perl
parent: 63effe85b514d8c1541b64909d90bf894e67fdb5 (diff)
added truecaser
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2112 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/recaser/train-truecaser.perl')
-rwxr-xr-x scripts/recaser/train-truecaser.perl 49
1 file changed, 49 insertions, 0 deletions
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
new file mode 100755
index 000000000..603ccac21
--- /dev/null
+++ b/scripts/recaser/train-truecaser.perl
@@ -0,0 +1,49 @@
+#!/usr/bin/perl -w
+
+# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+# Trains a truecasing model: for each lowercased word type in --corpus, counts
+# the casings seen in positions that are not sentence-initial, and writes one
+# line per type to --model as "best (count/total) other (count) ...".
+use strict;
+use Getopt::Long "GetOptions";
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# apply switches
+my ($MODEL,$CORPUS);
+die("train-truecaser.perl --model truecaser --corpus cased")
+    unless GetOptions('corpus=s' => \$CORPUS,
+                      'model=s' => \$MODEL) && defined($CORPUS) && defined($MODEL);
+my %CASING; # lowercased type => { observed casing => count }
+my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
+my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
+# Decode the corpus as UTF-8 so that lc() also folds non-ASCII letters.
+open(my $corpus_fh, "<:encoding(UTF-8)", $CORPUS)
+    || die("ERROR: could not open '$CORPUS': $!");
+while(my $line = <$corpus_fh>) {
+    my @WORD = split(' ', $line); # also discards the line terminator
+    # skip opening brackets/quotes, then the sentence-initial word itself
+    my $start = 0;
+    $start++ while $start<=$#WORD && exists($DELAYED_SENTENCE_START{$WORD[$start]});
+    for my $i ($start+1 .. $#WORD) {
+        # a word right after sentence-final punctuation is sentence-initial too
+        $CASING{ lc($WORD[$i]) }{ $WORD[$i] }++ unless exists($SENTENCE_END{$WORD[$i-1]});
+    }
+}
+close($corpus_fh);
+
+open(my $model_fh, ">:encoding(UTF-8)", $MODEL)
+    || die("ERROR: could not create '$MODEL': $!");
+# sorted iteration keeps the model file and tie-breaking reproducible
+foreach my $type (sort keys %CASING) {
+    my $count_of = $CASING{$type};
+    my @forms = sort { $count_of->{$b} <=> $count_of->{$a} || $a cmp $b } keys %$count_of;
+    my $total = 0;
+    $total += $count_of->{$_} foreach @forms;
+    my $best = shift @forms; # most frequent casing (ties: first in string order)
+    print $model_fh "$best ($count_of->{$best}/$total)";
+    print $model_fh " $_ ($count_of->{$_})" foreach @forms;
+    print $model_fh "\n";
+}
+close($model_fh) || die("ERROR: could not write '$MODEL': $!");