Welcome to mirror list, hosted at ThFree Co, Russian Federation.

corpusCreator.pl « Transliteration « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 8634d23ddbe00c7d9a618bae8ee63444353c8567 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/perl -w

use strict;

use utf8;
use Getopt::Std;
use IO::Handle;
binmode(STDIN,  ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');

# Command-line arguments (all four are required):
#   1. working directory holding the input file; output is written below it
#   2. name of the tab-separated input file inside that directory
#   3. file extension for the source-side output files
#   4. file extension for the target-side output files
die "Usage: $0 <work-dir> <pairs-file> <source-ext> <target-ext>\n"
    unless @ARGV >= 4;

my $tPath   = $ARGV[0];
my $tFile   = $ARGV[1];
my $inp_ext = $ARGV[2];
my $op_ext  = $ARGV[3];

# Character-separated source/target strings, kept in input order so the
# later train/tune split can walk both lists in parallel.
my @source;
my @target;

# Create the output directories with Perl's own mkdir instead of shelling
# out: paths containing spaces or shell metacharacters are handled safely
# and a real failure is reported instead of being silently ignored.
# An already existing directory is fine.
for my $dir ("$tPath/training", "$tPath/tuning") {
    next if -d $dir;
    mkdir $dir or die "Can't create directory $dir: $!\n";
}

my $pairs_path      = "$tPath/$tFile";
my $corpus_src_path = "$tPath/training/corpus.$inp_ext";
my $corpus_tgt_path = "$tPath/training/corpus.$op_ext";

open my $pairs_fh, "<:encoding(UTF-8)", $pairs_path
    or die "Can't open $pairs_path: $!\n";
open my $corpus_src_fh, ">:encoding(UTF-8)", $corpus_src_path
    or die "Can't open $corpus_src_path: $!\n";
open my $corpus_tgt_fh, ">:encoding(UTF-8)", $corpus_tgt_path
    or die "Can't open $corpus_tgt_path: $!\n";

# First pass: each input line is "<source>\t<target>". Both sides are
# rewritten with a single space between every character and written to the
# full corpus.* files, one pair per line at the same line number.
while (my $line = <$pairs_fh>) {
    chomp $line;
    my ($src, $tgt) = split /\t/, $line;

    # A line without both fields would produce an empty corpus line (and an
    # uninitialized-value warning); skip it on both sides so the two files
    # stay aligned.
    unless (defined $src && length $src && defined $tgt && length $tgt) {
        warn "Skipping line $. of $pairs_path: expected two tab-separated fields\n";
        next;
    }

    my $spaced_src = join(' ', split(//, $src));
    my $spaced_tgt = join(' ', split(//, $tgt));

    print {$corpus_src_fh} "$spaced_src\n";
    print {$corpus_tgt_fh} "$spaced_tgt\n";

    push @source, $spaced_src;
    push @target, $spaced_tgt;
}

close($pairs_fh);
# Buffered write errors only surface at close, so check the output handles.
close($corpus_src_fh) or die "Can't close $corpus_src_path: $!\n";
close($corpus_tgt_fh) or die "Can't close $corpus_tgt_path: $!\n";

# Second pass: split the pairs collected in @source/@target into a tuning
# set (tuning/input, tuning/reference) and a reduced training set
# (training/corpusA.*).
my $dev_every      = 5;       # every 5th pair (5th, 10th, ...) goes to tuning
my $max_dev_size   = 1000;    # upper bound on the number of tuning pairs
my $min_split_size = 6000;    # below this many pairs corpusA.* is discarded

my $train_src_path = "$tPath/training/corpusA.$inp_ext";
my $train_tgt_path = "$tPath/training/corpusA.$op_ext";
my $dev_src_path   = "$tPath/tuning/input";
my $dev_tgt_path   = "$tPath/tuning/reference";

open my $train_src_fh, ">:encoding(UTF-8)", $train_src_path
    or die "Can't open $train_src_path: $!\n";
open my $train_tgt_fh, ">:encoding(UTF-8)", $train_tgt_path
    or die "Can't open $train_tgt_path: $!\n";

open my $dev_src_fh, ">:encoding(UTF-8)", $dev_src_path
    or die "Can't open $dev_src_path: $!\n";
open my $dev_tgt_fh, ">:encoding(UTF-8)", $dev_tgt_path
    or die "Can't open $dev_tgt_path: $!\n";

my $corpus_size = @source;
my $dev_size    = 0;

for my $index (0 .. $#source) {
    # Pairs at 1-based positions divisible by $dev_every are held out for
    # tuning until the cap is reached; everything else is training data.
    if (($index + 1) % $dev_every == 0 && $dev_size < $max_dev_size) {
        print {$dev_src_fh} "$source[$index]\n";
        print {$dev_tgt_fh} "$target[$index]\n";
        $dev_size++;
    }
    else {
        print {$train_src_fh} "$source[$index]\n";
        print {$train_tgt_fh} "$target[$index]\n";
    }
}

# Buffered write errors only surface at close, so check every output handle.
close($train_src_fh) or die "Can't close $train_src_path: $!\n";
close($train_tgt_fh) or die "Can't close $train_tgt_path: $!\n";
close($dev_src_fh)   or die "Can't close $dev_src_path: $!\n";
close($dev_tgt_fh)   or die "Can't close $dev_tgt_path: $!\n";

# For small corpora the reduced training files are removed again; the tuning
# files are kept. NOTE(review): presumably the caller then trains on the
# full corpus.* files instead -- confirm against the calling script.
if ($corpus_size < $min_split_size) {
    for my $path ($train_src_path, $train_tgt_path) {
        # unlink instead of a shelled-out rm: safe for any path, and a
        # failure is reported without aborting the script.
        unlink $path or warn "Can't remove $path: $!\n";
    }
}