#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#
# Transliterates a file of out-of-vocabulary (OOV) words with an existing
# Moses transliteration model:
#   1. every distinct OOV word is split into space-separated characters,
#   2. the transliteration model is filtered for that input and the Moses
#      decoder is run on it, producing an n-best list,
#   3. the n-best list is reduced to "<transliteration>\t<score>" lines in
#      <out-dir>/out.txt.

use warnings;
use strict;
use utf8;

use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
use IO::Handle;
use File::Basename;
use File::Copy qw(copy);
use File::Path qw(make_path);

binmode(STDIN,  ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');

my $OUT_DIR = "/tmp/Transliteration-Phrase-Table.$$";

my ($MOSES_SRC_DIR, $TRANSLIT_MODEL, $OOV_FILE, $EXTERNAL_BIN_DIR,
    $INPUT_EXTENSION, $OUTPUT_EXTENSION);

die("ERROR: wrong syntax when invoking train-transliteration-PT.pl")
    unless GetOptions(
        'moses-src-dir=s'             => \$MOSES_SRC_DIR,
        'external-bin-dir=s'          => \$EXTERNAL_BIN_DIR,
        'transliteration-model-dir=s' => \$TRANSLIT_MODEL,
        'input-extension=s'           => \$INPUT_EXTENSION,
        'output-extension=s'          => \$OUTPUT_EXTENSION,
        'out-dir=s'                   => \$OUT_DIR,
        'oov-file=s'                  => \$OOV_FILE,
    );

# Check that all required options were given.  --external-bin-dir is included
# because it is interpolated into the train-model.perl command line; if it
# were undefined, the following option would be swallowed as its value.
die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --oov-file, --input-extension, --output-extension")
    unless (defined($MOSES_SRC_DIR)
         && defined($EXTERNAL_BIN_DIR)
         && defined($TRANSLIT_MODEL)
         && defined($OOV_FILE)
         && defined($INPUT_EXTENSION)
         && defined($OUTPUT_EXTENSION));

# Check that the files are in place.
die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'")
    unless -e $TRANSLIT_MODEL;
die("ERROR: could not find OOV file '$OOV_FILE'")
    unless -e $OOV_FILE;

my $UNK_FILE_NAME = basename($OOV_FILE);
my $WORK_DIR      = "$OUT_DIR/$UNK_FILE_NAME";

# Create the working directories without going through a shell, so paths
# containing spaces or shell metacharacters are handled correctly.
make_path("$WORK_DIR/training");
die("ERROR: could not create directory '$WORK_DIR/training'")
    unless -d "$WORK_DIR/training";

# The copy is only kept for reference (later steps read $OOV_FILE directly),
# so a failure here is reported but is not fatal.
copy($OOV_FILE, "$WORK_DIR/$UNK_FILE_NAME")
    or warn "WARNING: could not copy '$OOV_FILE' to '$WORK_DIR/$UNK_FILE_NAME': $!\n";

my $translitFile = "$WORK_DIR/$UNK_FILE_NAME.translit";

print STDERR "Preparing for Transliteration\n";
prepare_for_transliteration($OOV_FILE, $translitFile);

print STDERR "Run Transliteration\n";
run_transliteration($MOSES_SRC_DIR, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $translitFile);

print STDERR "Form Transliteration Corpus\n";
form_corpus($translitFile, $translitFile . ".op.nBest", $OUT_DIR);

################### Run a shell command, reporting failures ###############################

# Runs $command through the shell, capturing (and returning) its standard
# output exactly as backticks do.  A non-zero wait status is reported on
# STDERR but does not abort the script.
sub _run_shell {
    my ($command) = @_;

    my $stdout = qx{$command};
    if ($? != 0) {
        warn "WARNING: command finished with wait status $?: $command\n";
    }
    return $stdout;
}

################### Read the UNK word file and prepare for Transliteration ###############################

# Reads the whitespace-separated words of $testFile, de-duplicates them and
# writes each distinct word to $translitFile as one line of space-separated
# characters (the input format used for the transliteration decoder run).
#
#   $testFile      - path of the UTF-8 OOV word file to read
#   $translitFile  - path of the UTF-8 file to create
#
# Dies if either file cannot be opened or the output cannot be flushed.
sub prepare_for_transliteration {
    my ($testFile, $translitFile) = @_;

    my %UNK;

    open my $in, "<:encoding(UTF-8)", $testFile
        or die "Can't open $testFile: $!\n";
    while (my $line = <$in>) {
        chomp $line;
        # split ' ' skips leading whitespace and collapses runs of it, so no
        # empty "word" ends up in the set.
        $UNK{$_} = 1 for split ' ', $line;
    }
    close $in;

    open my $out, ">:encoding(UTF-8)", $translitFile
        or die "Can't open $translitFile: $!\n";
    # Sorted so that the generated file is reproducible between runs.
    for my $word (sort keys %UNK) {
        print {$out} join(' ', split(//, $word)), "\n";
    }
    close $out
        or die "Can't close $translitFile: $!\n";

    return;
}

################### Run Transliteration Module to Obtain Transliterations ###############################

# Filters the transliteration model for the words in $eval_file and runs the
# Moses decoder on them.  The decoder output is written to "$eval_file.op"
# and its 50-best list to "$eval_file.op.nBest".
#
#   $moses_src         - Moses source/installation directory
#   $external_bin_dir  - directory passed to train-model.perl as -external-bin-dir
#   $translit_model    - transliteration model directory (model/ and tuning/ are read)
#   $eval_file         - character-split OOV file to transliterate
#
# Also reads the file-level $INPUT_EXTENSION and $OUTPUT_EXTENSION.
sub run_transliteration {
    my ($moses_src, $external_bin_dir, $translit_model, $eval_file) = @_;

    my $table_ini = "$eval_file.moses.table.ini";

    # Equivalent of `touch`: make sure the file exists; it is also passed as
    # the -lm file argument below.
    open my $touch, '>>', $table_ini
        or die "Can't create $table_ini: $!\n";
    close $touch;

    print STDERR "Filter Table\n";

    _run_shell(qq{$moses_src/scripts/training/train-model.perl \\
        -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
        -external-bin-dir $external_bin_dir -f $INPUT_EXTENSION \\
        -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
        -reordering msd-bidirectional-fe -score-options '--KneserNey' \\
        -phrase-translation-table $translit_model/model/phrase-table \\
        -reordering-table $translit_model/model/reordering-table \\
        -config $table_ini \\
        -lm 0:3:$table_ini:8});

    _run_shell(qq{$moses_src/scripts/training/filter-model-given-input.pl \\
        $eval_file.filtered $table_ini $eval_file \\
        -Binarizer "$moses_src/bin/CreateOnDiskPt 1 1 4 100 2"});

    unlink $table_ini
        or warn "WARNING: could not remove $table_ini: $!\n";

    print STDERR "Apply Filter\n";

    _run_shell(qq{$moses_src/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\
        $eval_file.filtered/moses.ini $translit_model/model/moses.ini \\
        $translit_model/tuning/moses.tuned.ini $eval_file.filtered.ini});

    _run_shell(qq{$moses_src/bin/moses \\
        -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
        -threads 16 -drop-unknown -distortion-limit 0 \\
        -n-best-list $eval_file.op.nBest 50 \\
        -f $eval_file.filtered.ini \\
        < $eval_file \\
        > $eval_file.op});

    return;
}

################### Read the output of Transliteration Model and Form Corpus ###############################

# Reads the n-best list $testFile and writes one "<hypothesis>\t<score>" line
# per entry to "$EVAL_DIR/out.txt".  Each input line is expected to look like
#
#   <id> ||| <c1> <c2> ... ||| <feature scores> ||| <score> ...
#
# The hypothesis tokens are concatenated without separators and the score is
# the first token after the third "|||".  Lines with fewer than three "|||"
# separators are skipped with a warning.
#
#   $inp_file  - accepted for interface compatibility; not used
#   $testFile  - path of the n-best list to read
#   $EVAL_DIR  - directory in which out.txt is created
#
# Dies if either file cannot be opened or the output cannot be flushed.
sub form_corpus {
    my ($inp_file, $testFile, $EVAL_DIR) = @_;

    my $outFile = "$EVAL_DIR/out.txt";

    open my $nbest, "<:encoding(UTF-8)", $testFile
        or die "Can't open $testFile: $!\n";
    open my $out, ">:encoding(UTF-8)", $outFile
        or die "Can't open $outFile: $!\n";

    LINE:
    while (my $line = <$nbest>) {
        chomp $line;
        my @words = split ' ', $line;

        # Tokens 0 and 1 are the sentence id and the first "|||".
        my $i       = 2;
        my $thisStr = "";
        while ($i < @words && $words[$i] ne "|||") {
            $thisStr .= $words[$i];
            $i++;
        }
        $i++;    # step over the "|||" that closes the hypothesis

        # Skip the feature-score field.
        while ($i < @words && $words[$i] ne "|||") {
            $i++;
        }
        $i++;    # step over the "|||" that closes the feature scores

        if ($i >= @words) {
            warn "WARNING: skipping malformed n-best entry at $testFile line $.\n";
            next LINE;
        }

        my $prob = $words[$i];
        print {$out} "$thisStr\t$prob\n";
    }

    close $nbest;
    close $out
        or die "Can't close $outFile: $!\n";

    return;
}