Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Rico Sennrich <rico.sennrich@gmx.ch> 2015-03-20 17:44:48 +0300
committer: Rico Sennrich <rico.sennrich@gmx.ch> 2015-03-20 17:44:48 +0300
commit: ca08b1d205f3506c51e40f824c2f71634d5d4c43 (patch)
tree: 0237bf151c0bd2eba72a099139cff22df44ebb6d
parent: b8ca33c34ee7108c52f5afe6c5f6dc2722c0dbab (diff)
reduce-factors: port xml support from train-model.perl
-rwxr-xr-x scripts/training/reduce-factors.perl 78
1 files changed, 44 insertions, 34 deletions
diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl
index c7269abf9..24c9be829 100755
--- a/scripts/training/reduce-factors.perl
+++ b/scripts/training/reduce-factors.perl
@@ -10,11 +10,12 @@ my $___FACTOR_DELIMITER = "|";
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
-my ($CORPUS,$REDUCED,$FACTOR);
+my ($CORPUS,$REDUCED,$FACTOR,$_XML);
die("ERROR: wrong syntax when invoking reduce-factors")
unless &GetOptions('corpus=s' => \$CORPUS,
'reduced-corpus=s' => \$REDUCED,
- 'factor=s' => \$FACTOR);
+ 'factor=s' => \$FACTOR,
+ 'xml' => \$_XML);
&reduce_factors($CORPUS,$REDUCED,$FACTOR);
@@ -24,9 +25,9 @@ sub reduce_factors {
my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);
- print "Reducing factors to produce $reduced @ ".`date`;
+ print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`;
while(-e $reduced.".lock") {
- sleep(10);
+ sleep(10);
}
if (-e $reduced) {
print STDERR " $reduced in place, reusing\n";
@@ -37,29 +38,31 @@ sub reduce_factors {
return;
}
- # peek at input, to check if we are asked to produce exactly the
- # available factors
- my $inh = open_or_zcat($full);
- my $firstline = <$inh>;
- die "Corpus file $full is empty" unless $firstline;
- close $inh;
- # pick first word
- $firstline =~ s/^\s*//;
- $firstline =~ s/\s.*//;
- # count factors
- my @WORD = split(/ /,$firstline);
- my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
- my $maxfactorindex = scalar(@FACTOR)-1;
- if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
- # create just symlink; preserving compression
- my $realfull = $full;
- if (!-e $realfull && -e $realfull.".gz") {
+ unless ($_XML) {
+ # peek at input, to check if we are asked to produce exactly the
+ # available factors
+ my $inh = open_or_zcat($full);
+ my $firstline = <$inh>;
+ die "Corpus file $full is empty" unless $firstline;
+ close $inh;
+ # pick first word
+ $firstline =~ s/^\s*//;
+ $firstline =~ s/\s.*//;
+ # count factors
+ my @WORD = split(/ /,$firstline);
+ my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
+ my $maxfactorindex = scalar(@FACTOR)-1;
+ if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
+ # create just symlink; preserving compression
+ my $realfull = $full;
+ if (!-e $realfull && -e $realfull.".gz") {
$realfull .= ".gz";
$reduced =~ s/(\.gz)?$/.gz/;
- }
- safesystem("ln -s '$realfull' '$reduced'")
+ }
+ safesystem("ln -s '$realfull' '$reduced'")
or die "Failed to create symlink $realfull -> $reduced";
- return;
+ return;
+ }
}
# The default is to select the needed factors
@@ -71,23 +74,30 @@ sub reduce_factors {
$nr++;
print STDERR "." if $nr % 10000 == 0;
print STDERR "($nr)" if $nr % 100000 == 0;
- chomp; s/ +/ /g; s/^ //; s/ $//;
- my $first = 1;
- foreach (split) {
- my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
+ s/<\S[^>]*>/ /g if $_XML; # remove xml
+ chomp; s/ +/ /g; s/^ //; s/ $//;
+ my $first = 1;
+ foreach (split) {
+ my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
# \Q causes to disable metacharacters in regex
- print OUT " " unless $first;
- $first = 0;
- my $first_factor = 1;
+ print OUT " " unless $first;
+ $first = 0;
+ my $first_factor = 1;
foreach my $outfactor (@INCLUDE) {
- print OUT "|" unless $first_factor;
+ print OUT $___FACTOR_DELIMITER unless $first_factor;
$first_factor = 0;
my $out = $FACTOR[$outfactor];
die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
print OUT $out;
}
- }
- print OUT "\n";
+ # for(my $factor=0;$factor<=$#FACTOR;$factor++) {
+ # next unless defined($INCLUDE{$factor});
+ # print OUT "|" unless $first_factor;
+ # $first_factor = 0;
+ # print OUT $FACTOR[$factor];
+ # }
+ }
+ print OUT "\n";
}
print STDERR "\n";
close(OUT);