Welcome to mirror list, hosted at ThFree Co, Russian Federation.

mml-train.perl « support « ems « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
blob: aacf153a70adf9fb83f66fcf14dd182f34337531 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env perl 

use strict;

# Command-line settings. All of them are file-scope lexicals because the
# subroutines below read $model, $order, $lm_training, $lm_binarizer,
# $lm_settings and $line_count directly.
my ($indomain_source,$indomain_target,$outdomain_source,$outdomain_target,$lm_training,$lm_binarizer,$order,$lm_settings,$line_count,$model);

use Getopt::Long;
GetOptions('in-source=s' => \$indomain_source,
           'in-target=s' => \$indomain_target,
           'out-source=s' => \$outdomain_source,
           'out-target=s' => \$outdomain_target,
           'model=s' => \$model,
           'lm-training=s' => \$lm_training,
           'lm-binarizer=s' => \$lm_binarizer,
           'order=s' => \$order,
           'lm-settings=s' => \$lm_settings,
           'line-count=i' => \$line_count
    ) or exit(1);

# All four corpus files must be named on the command line ...
die("ERROR: in-domain source file not specified (-in-source FILE)") unless defined($indomain_source);
die("ERROR: in-domain target file not specified (-in-target FILE)") unless defined($indomain_target);
die("ERROR: out-of-domain source file not specified (-out-source FILE)") unless defined($outdomain_source);
die("ERROR: out-of-domain target file not specified (-out-target FILE)") unless defined($outdomain_target);

# ... and must exist, either as given or with a ".gz" suffix appended.
die("ERROR: in-domain source file '$indomain_source' not found") unless -e $indomain_source || -e $indomain_source.".gz";
die("ERROR: in-domain target file '$indomain_target' not found") unless -e $indomain_target || -e $indomain_target.".gz";
die("ERROR: out-of-domain source file '$outdomain_source' not found") unless -e $outdomain_source || -e $outdomain_source.".gz";
die("ERROR: out-of-domain target file '$outdomain_target' not found") unless -e $outdomain_target || -e $outdomain_target.".gz";

# Language model settings and the output file stem are mandatory as well;
# only --line-count is optional.
die("ERROR: language model order not specified (-order NUM)") unless defined($order);
die("ERROR: language model settings not specified (-lm-settings STRING)") unless defined($lm_settings);
die("ERROR: language model command not specified (-lm-training CMD)") unless defined($lm_training);
die("ERROR: language model binarizer not specified (-lm-binarizer CMD)") unless defined($lm_binarizer);
die("ERROR: model not specified (-model FILESTEM)") unless defined($model);

# Train the in-domain models first: their vocabularies are extracted and then
# passed (third argument) to the training of the out-of-domain models.
train_lm($indomain_source,"in-source");
train_lm($indomain_target,"in-target");
extract_vocabulary("in-source");
extract_vocabulary("in-target");
train_lm($outdomain_source,"out-source","in-source");
train_lm($outdomain_target,"out-target","in-target");

# Write the unigram vocabulary of a trained language model to a file.
#
# Reads "$model.$type.lm" and writes one word per line to
# "$model.$type.vocab". Only lines between the "\1-grams:" header and the
# "\2-grams:" header are considered, and only those that split into exactly
# three whitespace-separated fields (in the ARPA format: log-probability,
# word, back-off weight); the second field is taken as the word.
# The special tokens <s>, </s> and <unk> are never written.
#
# Arguments:
#   $type - label of the model whose vocabulary is extracted
#           (this script uses "in-source" and "in-target")
#
# Dies if the language model cannot be read or the vocabulary file cannot
# be written.
sub extract_vocabulary {
  my ($type) = @_;
  print STDERR "extracting vocabulary from $type language model\n";
  my $lm_file = "$model.$type.lm";
  my $vocab_file = "$model.$type.vocab";
  # fail loudly: a missing model would otherwise yield a silently empty vocabulary
  open(my $lm_fh,'<',$lm_file)
    or die("ERROR: could not open language model '$lm_file': $!");
  open(my $vocab_fh,'>',$vocab_file)
    or die("ERROR: could not open vocabulary file '$vocab_file' for writing: $!");
  my $unigrams = 0;
  while(my $line = <$lm_fh>) {
    $unigrams = 1 if $line =~ /^\\1-grams:/;
    last if $line =~ /^\\2-grams:/;
    next unless $unigrams;
    my @TOKEN = split(/\s/,$line);
    # entries without a third field (and the section header itself) are skipped
    next unless @TOKEN == 3;
    next if $TOKEN[1] eq '<s>';
    next if $TOKEN[1] eq '<unk>';
    # end-of-sentence marker; the previous comparison against '<\s>' could never match it
    next if $TOKEN[1] eq '</s>';
    print $vocab_fh $TOKEN[1]."\n";
  }
  close($lm_fh);
  # buffered write errors only surface when the handle is closed
  close($vocab_fh)
    or die("ERROR: could not finish writing vocabulary file '$vocab_file': $!");
}

# Run a shell command: echo the command line to STDERR, execute it through
# the shell, and forward whatever it printed on standard output to STDERR.
# Dies if the command could not be started or ended with a non-zero status,
# so that a failed step is not silently followed by the next one.
sub run_logged_command {
  my ($cmd) = @_;
  print STDERR $cmd."\n";
  my $output = `$cmd`;
  my $status = $?;
  print STDERR $output if defined($output);
  if ($status == -1) {
    die("ERROR: could not execute command: $cmd");
  }
  elsif ($status != 0) {
    die(sprintf("ERROR: command failed (exit code %d, signal %d): %s",
                $status >> 8, $status & 127, $cmd));
  }
}

# Train and binarize one language model.
#
# Arguments:
#   $file  - training corpus; if "$file.gz" exists it is preferred when
#            sampling lines
#   $type  - label used in all output file names ("$model.$type.*")
#   $vocab - optional label of a previously extracted vocabulary; when given,
#            "-vocab $model.$vocab.vocab" is appended to the training command
#
# Output files: "$model.$type.lm", "$model.$type.binlm" and, if --line-count
# was given, the sampled corpus "$model.$type.tok".
#
# Dies if any of the external commands fails.
sub train_lm {
  my ($file,$type,$vocab) = @_;
  print STDERR "training $type language model\n";
  if (defined($line_count)) {
    my $compressed = -e $file.".gz";
    my $cmd = ($compressed ? "zcat $file.gz" : "cat $file");
    # the corpus file itself is handed to shuf as its --random-source
    $cmd .= " | shuf -n $line_count --random-source ".($compressed ? "$file.gz" : $file)." > $model.$type.tok";
    print STDERR "extracting $line_count random lines from $file\n";
    run_logged_command($cmd);
    $file = "$model.$type.tok";
  }
  # NOTE(review): without --line-count, $file is passed on unchanged even if
  # only "$file.gz" exists -- confirm the LM training command copes with that.

  my $cmd = "$lm_training -order $order $lm_settings -text $file -lm $model.$type.lm";
  $cmd .= " -vocab $model.$vocab.vocab" if defined($vocab);
  run_logged_command($cmd);

  run_logged_command("$lm_binarizer $model.$type.lm $model.$type.binlm");
}