diff options
author | Rico Sennrich <rico.sennrich@gmx.ch> | 2015-03-20 17:44:48 +0300 |
---|---|---|
committer | Rico Sennrich <rico.sennrich@gmx.ch> | 2015-03-20 17:44:48 +0300 |
commit | ca08b1d205f3506c51e40f824c2f71634d5d4c43 (patch) | |
tree | 0237bf151c0bd2eba72a099139cff22df44ebb6d | |
parent | b8ca33c34ee7108c52f5afe6c5f6dc2722c0dbab (diff) |
reduce-factors: port xml support from train-model.perl
-rwxr-xr-x | scripts/training/reduce-factors.perl | 78 |
1 files changed, 44 insertions, 34 deletions
diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl index c7269abf9..24c9be829 100755 --- a/scripts/training/reduce-factors.perl +++ b/scripts/training/reduce-factors.perl @@ -10,11 +10,12 @@ my $___FACTOR_DELIMITER = "|"; my $ZCAT = "gzip -cd"; my $BZCAT = "bzcat"; -my ($CORPUS,$REDUCED,$FACTOR); +my ($CORPUS,$REDUCED,$FACTOR,$_XML); die("ERROR: wrong syntax when invoking reduce-factors") unless &GetOptions('corpus=s' => \$CORPUS, 'reduced-corpus=s' => \$REDUCED, - 'factor=s' => \$FACTOR); + 'factor=s' => \$FACTOR, + 'xml' => \$_XML); &reduce_factors($CORPUS,$REDUCED,$FACTOR); @@ -24,9 +25,9 @@ sub reduce_factors { my @INCLUDE = sort {$a <=> $b} split(/,/,$factors); - print "Reducing factors to produce $reduced @ ".`date`; + print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`; while(-e $reduced.".lock") { - sleep(10); + sleep(10); } if (-e $reduced) { print STDERR " $reduced in place, reusing\n"; @@ -37,29 +38,31 @@ sub reduce_factors { return; } - # peek at input, to check if we are asked to produce exactly the - # available factors - my $inh = open_or_zcat($full); - my $firstline = <$inh>; - die "Corpus file $full is empty" unless $firstline; - close $inh; - # pick first word - $firstline =~ s/^\s*//; - $firstline =~ s/\s.*//; - # count factors - my @WORD = split(/ /,$firstline); - my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]); - my $maxfactorindex = scalar(@FACTOR)-1; - if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) { - # create just symlink; preserving compression - my $realfull = $full; - if (!-e $realfull && -e $realfull.".gz") { + unless ($_XML) { + # peek at input, to check if we are asked to produce exactly the + # available factors + my $inh = open_or_zcat($full); + my $firstline = <$inh>; + die "Corpus file $full is empty" unless $firstline; + close $inh; + # pick first word + $firstline =~ s/^\s*//; + $firstline =~ s/\s.*//; + # count factors + my @WORD = split(/ /,$firstline); + my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]); + my $maxfactorindex = scalar(@FACTOR)-1; + if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) { + # create just symlink; preserving compression + my $realfull = $full; + if (!-e $realfull && -e $realfull.".gz") { $realfull .= ".gz"; $reduced =~ s/(\.gz)?$/.gz/; - } - safesystem("ln -s '$realfull' '$reduced'") + } + safesystem("ln -s '$realfull' '$reduced'") or die "Failed to create symlink $realfull -> $reduced"; - return; + return; + } } # The default is to select the needed factors @@ -71,23 +74,30 @@ sub reduce_factors { $nr++; print STDERR "." if $nr % 10000 == 0; print STDERR "($nr)" if $nr % 100000 == 0; - chomp; s/ +/ /g; s/^ //; s/ $//; - my $first = 1; - foreach (split) { - my @FACTOR = split /\Q$___FACTOR_DELIMITER/; + s/<\S[^>]*>/ /g if $_XML; # remove xml + chomp; s/ +/ /g; s/^ //; s/ $//; + my $first = 1; + foreach (split) { + my @FACTOR = split /\Q$___FACTOR_DELIMITER/; # \Q causes to disable metacharacters in regex - print OUT " " unless $first; - $first = 0; - my $first_factor = 1; + print OUT " " unless $first; + $first = 0; + my $first_factor = 1; foreach my $outfactor (@INCLUDE) { - print OUT "|" unless $first_factor; + print OUT $___FACTOR_DELIMITER unless $first_factor; $first_factor = 0; my $out = $FACTOR[$outfactor]; die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out; print OUT $out; } - } - print OUT "\n"; + # for(my $factor=0;$factor<=$#FACTOR;$factor++) { + # next unless defined($INCLUDE{$factor}); + # print OUT "|" unless $first_factor; + # $first_factor = 0; + # print OUT $FACTOR[$factor]; + # } + } + print OUT "\n"; } print STDERR "\n"; close(OUT); |