scripts/recaser/train-recaser.perl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220

#!/usr/bin/env perl

# $Id$
use warnings;
use strict;
use FindBin qw($Bin);
use Getopt::Long "GetOptions";

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

# apply switches
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG,$HELP,$ERROR);
my $LM = "KENLM"; # KENLM is default.
my $BUILD_LM = "build-lm.sh";
my $BUILD_KENLM = "$Bin/../../bin/lmplz";
my $BUILD_BINARY = "$Bin/../../bin/build_binary";
my $EXTRACT = "$Bin/../../bin/extract";
my $SCORE = "$Bin/../../bin/score";
my $CONSOLIDATE_DIRECT = "$Bin/../../bin/consolidate-direct";
my $NGRAM_COUNT = "ngram-count";
my $TRAIN_SCRIPT = "$Bin/../training/train-model.perl";
my $MAX_LEN = 1;
my $FIRST_STEP = 1;
my $LAST_STEP = 11;
$ERROR = "training Aborted."
    unless &GetOptions('first-step=i' => \$FIRST_STEP,
                       'last-step=i' => \$LAST_STEP,
                       'corpus=s' => \$CORPUS,
                       'config=s' => \$CONFIG,
                       'dir=s' => \$DIR,
                       'ngram-count=s' => \$NGRAM_COUNT,
                       'build-lm=s' => \$BUILD_LM,
                       'build-kenlm=s' => \$BUILD_KENLM,
                       'lm=s' => \$LM,
                       'train-script=s' => \$TRAIN_SCRIPT,
                       'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
                       'max-len=i' => \$MAX_LEN,
                       'help' => \$HELP);

# check and set default to unset parameters
$ERROR = "please specify working dir --dir" unless defined($DIR) || defined($HELP);
$ERROR = "please specify --corpus" if !defined($CORPUS) && !defined($HELP)
                                  && $FIRST_STEP <= 2 && $LAST_STEP >= 1;

if ($HELP || $ERROR) {
    if ($ERROR) {
        print STDERR "ERROR: " . $ERROR . "\n";
    }
    print STDERR "Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]";

    print STDERR "\n\nOptions:
  == MANDATORY ==
  --dir=dir                 ... outputted recaser directory.
  --corpus=file             ... inputted cased corpus.

  == OPTIONAL ==
  = Recaser Training configuration =
  --train-script=file       ... path to the train script (default: train-factored-phrase-model.perl in \$PATH).
  --config=config           ... training script configuration.
  --scripts-root-dir=dir    ... scripts directory.
  --max-len=int             ... max phrase length (default: 1).

  = Language Model Training configuration =
  --lm=[IRSTLM,SRILM,KENLM] ... language model (default: KENLM).
  --build-lm=file           ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM).
  --ngram-count=file        ... path to ngram-count.sh if not in \$PATH (used only with --lm=SRILM).

  = Steps this script will perform =
  (1) Truecasing;
  (2) Language Model Training;
  (3) Data Preparation
  (4-10) Recaser Model Training;
  (11) Cleanup.
  --first-step=[1-11]       ... step where script starts (default: 1).
  --last-step=[1-11]        ... step where script ends (default: 11).

  --help                    ... this usage output.\n";
  if ($ERROR) {
    exit(1);
  }
  else {
    exit(0);
  }
}

# main loop
`mkdir -p $DIR`;
&truecase()           if $FIRST_STEP == 1;
$CORPUS = "$DIR/aligned.truecased" if (-e "$DIR/aligned.truecased");
&train_lm()           if $FIRST_STEP <= 2;
&prepare_data()       if $FIRST_STEP <= 3 && $LAST_STEP >= 3;
&train_recase_model() if $FIRST_STEP <= 10 && $LAST_STEP >= 3;
&cleanup()            if $LAST_STEP == 11;

exit(0);

### subs ###

sub truecase {
    print STDERR "(1) Truecase data @ ".`date`;
    print STDERR "(1) To build model without truecasing, use --first-step 2, and make sure $DIR/aligned.truecased does not exist\n";

    my $cmd = "$Bin/train-truecaser.perl --model $DIR/truecaser_model --corpus $CORPUS";
    print STDERR $cmd."\n";
    system($cmd) == 0 || die("Training truecaser died with error " . ($? >> 8) . "\n");

    $cmd = "$Bin/truecase.perl --model $DIR/truecaser_model < $CORPUS > $DIR/aligned.truecased";
    print STDERR $cmd."\n";
    system($cmd) == 0 || die("Applying truecaser died with error " . ($? >> 8) . "\n");

}

sub train_lm {
    print STDERR "(2) Train language model on cased data @ ".`date`;
    my $cmd = "";
    if (uc $LM eq "IRSTLM") {
        $cmd = "$BUILD_LM -t /tmp -i $CORPUS -n 3 -o $DIR/cased.irstlm.gz";
    }
    elsif (uc $LM eq "SRILM") {
        $LM = "SRILM";
        $cmd = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -kndiscount";
    }
    else {
        $LM = "KENLM";
        $cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.arpa.gz";
    }
    print STDERR "** Using $LM **" . "\n";
    print STDERR $cmd."\n";
    system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");
    if ($LM eq "KENLM") {
      system("$BUILD_BINARY $DIR/cased.kenlm.arpa.gz $DIR/cased.kenlm ; rm $DIR/cased.kenlm.arpa.gz");
    }
}

sub prepare_data {
    print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;
    open(CORPUS,$CORPUS);
    binmode(CORPUS, ":utf8");
    open(CASED,">$DIR/aligned.cased");
    binmode(CASED, ":utf8");
    print "$DIR/aligned.lowercased\n";
    open(LOWERCASED,">$DIR/aligned.lowercased");
    binmode(LOWERCASED, ":utf8");
    open(ALIGNMENT,">$DIR/aligned.a");
    while(<CORPUS>) {
	next if length($_)>2000;
	s/\x{0}//g;
	s/\|//g;
	s/ +/ /g;
	s/^ //;
	s/ [\r\n]*$/\n/;
	next if /^$/;
	print CASED $_;
	print LOWERCASED lc($_);
	my $i=0;
	foreach (split) {
	    print ALIGNMENT "$i-$i ";
	    $i++;
	}
	print ALIGNMENT "\n";
    }
    close(CORPUS);
    close(CASED);
    close(LOWERCASED);
    close(ALIGNMENT);
}

sub train_recase_model {
    print STDERR "\n(4) Training recasing model @ ".`date`;
    my $first = $FIRST_STEP;
    $first = 4 if $first < 4;
    if ($MAX_LEN == 1) {
       my $cmd = "$EXTRACT $DIR/aligned.cased $DIR/aligned.lowercased $DIR/aligned.a $DIR/extract 1";
       system($cmd) == 0 || die("ERROR: extract (special case max-len 1) failed: $cmd");
       $cmd = "sort -S 2G $DIR/extract > $DIR/extract.sorted";
       system($cmd) == 0 || die("ERROR: sort extract (special case max-len 1) failed: $cmd");
       $cmd = "$SCORE $DIR/extract.sorted /dev/null $DIR/phrase-table-half --NoLex";
       system($cmd) == 0 || die("ERROR: score (special case max-len 1) failed: $cmd");
       $cmd = "$CONSOLIDATE_DIRECT $DIR/phrase-table-half $DIR/phrase-table";
       system($cmd) == 0 || die("ERROR: consolidate-direct (special case max-len 1) failed: $cmd");
       system("rm $DIR/phrase-table-half");
       system("gzip $DIR/phrase-table");
       $first = 9;
    }
    my $cmd = "$TRAIN_SCRIPT --root-dir $DIR --model-dir $DIR --first-step $first --alignment a --corpus $DIR/aligned --f lowercased --e cased --max-phrase-length $MAX_LEN";
    if ($MAX_LEN == 1) {
      $cmd .= " --score-options='--NoLex --OnlyDirect'";
    }
    else {
      $cmd .= " --score-options='--OnlyDirect'";
    }
    if (uc $LM eq "IRSTLM") {
        $cmd .= " --lm 0:3:$DIR/cased.irstlm.gz:1";
    }
    elsif (uc $LM eq "SRILM") {
        $cmd .= " --lm 0:3:$DIR/cased.srilm.gz:8";
    }
    else {
        $cmd .= " --lm 0:3:$DIR/cased.kenlm:8";
    }
    $cmd .= " -config $CONFIG" if $CONFIG;
    print STDERR $cmd."\n";
    system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
}

sub cleanup {
    print STDERR "\n(11) Cleaning up @ ".`date`;
    `rm -f $DIR/extract*`;
    my $clean_1 = $?;
    `rm -f $DIR/aligned*`;
    my $clean_2 = $?;
    `rm -f $DIR/lex*`;
    my $clean_3 = $?;
    `rm -f $DIR/truecaser_model`;
    my $clean_4 = $?;
    if ($clean_1 + $clean_2 + $clean_3 + $clean_4 != 0) {
        print STDERR "Training successful but some files could not be cleaned.\n";
    }
}