#!/usr/bin/env perl use warnings; use strict; my ($indomain_source,,$indomain_target,$outdomain_source,$outdomain_target,$lm_training,$lm_binarizer,$order,$lm_settings,$line_count,$model); use Getopt::Long; GetOptions('in-source=s' => \$indomain_source, 'in-target=s' => \$indomain_target, 'out-source=s' => \$outdomain_source, 'out-target=s' => \$outdomain_target, 'model=s' => \$model, 'lm-training=s' => \$lm_training, 'lm-binarizer=s' => \$lm_binarizer, 'order=s' => \$order, 'lm-settings=s' => \$lm_settings, 'line-count=i' => \$line_count ) or exit(1); die("ERROR: in-domain source file not specified (-in-source FILE)") unless defined($indomain_source); die("ERROR: in-domain target file not specified (-in-target FILE)") unless defined($indomain_target); die("ERROR: out-of-domain source file not specified (-out-source FILE)") unless defined($outdomain_source); die("ERROR: out-of-domain target file not specified (-out-target FILE)") unless defined($outdomain_target); die("ERROR: in-domain source file '$indomain_source' not found") unless -e $indomain_source || -e $indomain_source.".gz"; die("ERROR: in-domain target file '$indomain_target' not found") unless -e $indomain_target || -e $indomain_target.".gz"; die("ERROR: out-of-domain source file '$outdomain_source' not found") unless -e $outdomain_source || -e $outdomain_source.".gz"; die("ERROR: out-of-domain target file '$outdomain_target' not found") unless -e $outdomain_target || -e $outdomain_target.".gz"; die("ERROR: language model order not specified (-order NUM)") unless defined($order); die("ERROR: language model settings not specified (-lm-settings STRING)") unless defined($lm_settings); die("ERROR: language model command not specified (-lm-training CMD)") unless defined($lm_training); die("ERROR: language model binarizer not specified (-lm-binarizer CMD)") unless defined($lm_binarizer); die("ERROR: model not specified (-model FILESTEM)") unless defined($model); &train_lm($indomain_source,"in-source"); &train_lm($indomain_target,"in-target"); &extract_vocabulary("in-source"); &extract_vocabulary("in-target"); &train_lm($outdomain_source,"out-source","in-source"); &train_lm($outdomain_target,"out-target","in-target"); sub extract_vocabulary { my ($type) = @_; print STDERR "extracting vocabulary from $type language model\n"; open(LM,"$model.$type.lm"); open(VOCAB,">$model.$type.vocab"); my $unigrams = 0; while() { $unigrams = 1 if /^\\1-grams:/; last if /^\\2-grams:/; next unless $unigrams; my @TOKEN = split(/\s/); next unless @TOKEN == 3; next if $TOKEN[1] eq ''; next if $TOKEN[1] eq ''; next if $TOKEN[1] eq '<\\s>'; print VOCAB $TOKEN[1]."\n"; } close(LM); close(VOCAB); } sub train_lm { my ($file,$type,$vocab) = @_; print STDERR "training $type language model\n"; if (defined($line_count)) { my $cmd = (-e $file.".gz" ? "zcat $file.gz" : "cat $file"); $cmd .= " | shuf -n $line_count --random-source ".(-e $file.".gz" ? "$file.gz" : $file)." > $model.$type.tok"; print STDERR "extracting $line_count random lines from $file\n$cmd\n"; print STDERR `$cmd`; $file = "$model.$type.tok"; } my $cmd = "$lm_training -order $order $lm_settings -text $file -lm $model.$type.lm"; $cmd .= " -vocab $model.$vocab.vocab" if defined($vocab); print STDERR $cmd."\n"; print STDERR `$cmd`; $cmd = "$lm_binarizer $model.$type.lm $model.$type.binlm"; print STDERR $cmd."\n"; print STDERR `$cmd`; }