Welcome to mirror list, hosted at ThFree Co, Russian Federation.

trainlm-irst.perl « generic « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 71f6e08cf3666bcb2933b4e04002d9abbd2ea5ba (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/perl -w

# Compatible with sri LM-creating script, eg.
#    ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section
#    lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irst-dir"
#    settings = ""
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section).
# It should point to the bin directory of the IRSTLM toolkit, eg
#    irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
# And make sure that $cores is defined, eg $cores = 8
# And make sure the $settings variable is empty. This script doesn't understand some of the sri args like -unk and will complain.

use strict;
use FindBin qw($Bin);
use Getopt::Long;

# Build an ARPA language model with IRSTLM behind an SRILM-like command line.
#
# Steps:
#   1. wrap every corpus sentence in start/end tags (add-start-end.sh)
#   2. estimate the LM in IRSTLM's intermediate format (build-lm.sh)
#   3. convert it to a plain-text LM, gzipped if -lm ends in ".gz" (compile-lm)
#   4. remove the per-process temporary directory
#
# Any failing step aborts the script with a non-zero exit status, so a caller
# never mistakes a missing or truncated LM for a successful run.

# Defaults; each may be overridden on the command line.
my $order = 3;
my $corpusPath;
my $lmPath;
my $cores = 2;
my $irstPath;
my $tempPath = "tmp";
my $temp;    # sink for SRILM-only switches that are accepted but ignored

GetOptions("order=s"      => \$order,
           "text=s"       => \$corpusPath,
           "lm=s"         => \$lmPath,
           "cores=s"      => \$cores,
           "irst-dir=s"   => \$irstPath,
           "temp-dir=s"   => \$tempPath,
           "interpolate!" => \$temp,  #ignore
           "kndiscount!"  => \$temp   #ignore
           ) or exit 1;

#die("ERROR: please set order") unless defined($order);
die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);

# file_extension($path)
# Returns everything after the last "." in $path (the whole path when it
# contains no "."), or "" when the path ends in "." so that callers never
# compare against undef.
sub file_extension
{
    my ($path) = @_;
    return ($path =~ m/([^.]+)$/) ? $1 : "";
}

# run_command($cmd)
# Logs $cmd to STDERR, runs it through the shell and dies if it could not be
# started, was killed by a signal, or exited non-zero.
# Backticks are kept (instead of system) so that the child's STDOUT is still
# swallowed rather than mixed into this script's STDOUT.
# NOTE: for a shell pipeline only the status of the LAST stage is visible here.
sub run_command
{
    my ($cmd) = @_;
    print STDERR "EXECUTING $cmd\n";
    `$cmd`;
    if ($? == -1)
    {
        die("ERROR: could not execute command ($!): $cmd");
    }
    if ($? & 127)
    {
        die("ERROR: command died with signal " . ($? & 127) . ": $cmd");
    }
    if ($? != 0)
    {
        die("ERROR: command exited with status " . ($? >> 8) . ": $cmd");
    }
    return;
}

my $ext = file_extension($corpusPath);
print "extension is $ext\n";

# The process id keeps concurrent runs from sharing a working directory.
$tempPath .= "/irstlm-build-tmp.$$";
`mkdir -p $tempPath`;
die("ERROR: could not create temporary directory $tempPath") unless -d $tempPath;

# Step 1: add sentence boundary tags. "gunzip -c" is used instead of "zcat"
# because BSD/macOS zcat only handles ".Z" files; it also matches the
# decompressor handed to build-lm.sh below.
my $reader = ($ext eq "gz") ? "gunzip -c" : "cat";
run_command("$reader $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz");

# Step 2: estimate the model. build-lm.sh locates its helpers through $IRSTLM,
# which is the parent of the bin directory given by -irst-dir.
run_command("IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -p -o $tempPath/iarpa.gz -k $cores");

# Step 3: convert to the final text LM, compressing when the target asks for it.
$ext = file_extension($lmPath);
print "extension is $ext\n";

if ($ext eq "gz")
{
    run_command("$irstPath/compile-lm $tempPath/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath");
}
else
{
    run_command("$irstPath/compile-lm $tempPath/iarpa.gz --text yes $lmPath");
}

# Step 4: cleanup is best effort -- the LM already exists at this point, so a
# failure to remove the scratch directory is only worth a warning.
my $cmd = "rm -rf $tempPath";
print STDERR "EXECUTING $cmd\n";
`$cmd`;
warn("WARNING: could not remove temporary directory $tempPath\n") if $? != 0;

print STDERR "FINISH.\n";