Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: pjwilliams <pjwilliams@1f5c12ca-751b-0410-a591-d2e778427230> 2010-04-16 13:45:51 +0400
committer: pjwilliams <pjwilliams@1f5c12ca-751b-0410-a591-d2e778427230> 2010-04-16 13:45:51 +0400
commit: 2edfc169122a0e7881e55fb451773142274f2556 (patch)
tree: 260c6947743baff315bf899a05987726723c6110 /scripts/training/clean-corpus-n.perl
parent: a2233d0f8da8e78611bdf9466419ff7dc4a7e724 (diff)
Merge remaining script support for tree-based models from mt3_chart.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3137 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/training/clean-corpus-n.perl')
-rwxr-xr-x scripts/training/clean-corpus-n.perl | 6
1 files changed, 4 insertions, 2 deletions
diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index d15115c30..84f347c79 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -5,6 +5,7 @@ use strict;
use Getopt::Long;
my $help;
my $lc = 0; # lowercase the corpus?
+my $ignore_ratio = 0;
my $enc = "utf8"; # encoding of the input and output files
# set to anything else you wish, but I have not tested it yet
@@ -12,6 +13,7 @@ GetOptions(
"help" => \$help,
"lowercase|lc" => \$lc,
"encoding=s" => \$enc,
+ "ignore-ratio" => \$ignore_ratio
) or exit(1);
if (scalar(@ARGV) < 6 || $help) {
@@ -102,8 +104,8 @@ while(my $f = <F>) {
next if scalar(@F) > $max;
next if scalar(@E) < $min;
next if scalar(@F) < $min;
- next if scalar(@E)/scalar(@F) > 9;
- next if scalar(@F)/scalar(@E) > 9;
+ next if !$ignore_ratio && scalar(@E)/scalar(@F) > 9;
+ next if !$ignore_ratio && scalar(@F)/scalar(@E) > 9;
# An extra check: none of the factors can be blank!
die "There is a blank factor in $corpus.$l1 on line $innr: $f"