#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#
# Splits a tab-separated parallel file into character-spaced training and
# tuning files.
#
# Usage: <script> PATH FILE INPUT_EXT OUTPUT_EXT
#
#   PATH        directory that contains FILE; output directories are created
#               beneath it
#   FILE        input file, one "source<TAB>target" pair per line
#   INPUT_EXT   file extension used for the source-side corpus files
#   OUTPUT_EXT  file extension used for the target-side corpus files
#
# Files written:
#   PATH/training/corpus.{INPUT_EXT,OUTPUT_EXT}   every pair
#   PATH/training/corpusA.{INPUT_EXT,OUTPUT_EXT}  pairs not held out for tuning
#                                                 (removed again when the
#                                                 corpus is small, see below)
#   PATH/tuning/input, PATH/tuning/reference      held-out pairs

use warnings;
use strict;
use utf8;
use Getopt::Std;
use IO::Handle;

binmode(STDIN,  ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');

die "Usage: $0 <path> <file> <input-ext> <output-ext>\n" if @ARGV < 4;

my ($tPath, $tFile, $inp_ext, $op_ext) = @ARGV;

# Every $dev_stride-th pair is held out for tuning, up to $max_dev_size pairs.
my $dev_stride   = 5;
my $max_dev_size = 1000;

# Below this many pairs the reduced "corpusA" training files are deleted again.
my $min_corpus_for_split = 6000;

# Create the output directories with the builtin rather than a shell command,
# so paths containing spaces or shell metacharacters are handled safely.
for my $dir ("$tPath/training", "$tPath/tuning") {
    next if -d $dir;
    mkdir $dir or die "Can't create directory $dir: $!\n";
}

# ---------------------------------------------------------------------------
# Pass 1: read all pairs, write the full character-spaced corpus and keep the
# spaced strings in memory for the split below.
# ---------------------------------------------------------------------------
my @source;
my @target;

my $in_path       = "$tPath/$tFile";
my $full_src_path = "$tPath/training/corpus.$inp_ext";
my $full_tgt_path = "$tPath/training/corpus.$op_ext";

my $in       = open_utf8('<', $in_path);
my $full_src = open_utf8('>', $full_src_path);
my $full_tgt = open_utf8('>', $full_tgt_path);

while (my $line = <$in>) {
    chomp $line;
    my ($src, $tgt) = split /\t/, $line;

    my $spaced_src = space_out($src);
    my $spaced_tgt = space_out($tgt);

    print {$full_src} "$spaced_src\n";
    print {$full_tgt} "$spaced_tgt\n";

    push @source, $spaced_src;
    push @target, $spaced_tgt;
}

close $in;
close_or_die($full_src, $full_src_path);
close_or_die($full_tgt, $full_tgt_path);

# ---------------------------------------------------------------------------
# Pass 2: split into tuning (held-out) and reduced training files.
# ---------------------------------------------------------------------------
my $train_src_path = "$tPath/training/corpusA.$inp_ext";
my $train_tgt_path = "$tPath/training/corpusA.$op_ext";
my $dev_src_path   = "$tPath/tuning/input";
my $dev_tgt_path   = "$tPath/tuning/reference";

my $train_src = open_utf8('>', $train_src_path);
my $train_tgt = open_utf8('>', $train_tgt_path);
my $dev_src   = open_utf8('>', $dev_src_path);
my $dev_tgt   = open_utf8('>', $dev_tgt_path);

my $corpus_size = @source;
my $dev_size    = 0;

for my $i (0 .. $#source) {
    # The 5th, 10th, 15th, ... pair (0-based index 4, 9, 14, ...) is held out
    # until the tuning set is full; everything else goes to training.
    my $is_dev_slot = ($i % $dev_stride == $dev_stride - 1);

    if ($is_dev_slot && $dev_size < $max_dev_size) {
        print {$dev_src} "$source[$i]\n";
        print {$dev_tgt} "$target[$i]\n";
        $dev_size++;
    }
    else {
        print {$train_src} "$source[$i]\n";
        print {$train_tgt} "$target[$i]\n";
    }
}

close_or_die($train_src, $train_src_path);
close_or_die($train_tgt, $train_tgt_path);
close_or_die($dev_src,   $dev_src_path);
close_or_die($dev_tgt,   $dev_tgt_path);

# For a small corpus the reduced training files are discarded; the full
# corpus.* files and the tuning files are left in place.
if ($corpus_size < $min_corpus_for_split) {
    for my $path ($train_src_path, $train_tgt_path) {
        unlink $path or warn "Can't remove $path: $!\n";
    }
}

# Open $path with the given mode ('<' or '>') using UTF-8 encoding and return
# the lexical filehandle; dies with the system error if the open fails.
sub open_utf8 {
    my ($mode, $path) = @_;
    open my $fh, "$mode:encoding(UTF-8)", $path
        or die "Can't open $path: $!\n";
    return $fh;
}

# Close a write handle and die if buffered output could not be flushed.
sub close_or_die {
    my ($fh, $path) = @_;
    close $fh or die "Can't close $path: $!\n";
    return;
}

# Return $word with a single space between each pair of adjacent characters.
# An undefined field (e.g. a line without a tab) yields the empty string so the
# source and target files stay line-aligned.
sub space_out {
    my ($word) = @_;
    return '' unless defined $word;
    return join ' ', split //, $word;
}