Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2007-06-18 19:50:04 +0400
committer phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2007-06-18 19:50:04 +0400
commit 960bebdd4a6267ab2afa576285268c6322a810c5 (patch)
tree cb7fd6c4bc297f471a750f97c595501f5e36279c /scripts/training/clean-corpus-n.perl
parent c99167d709f8658f3ab18cf3e36e5fef1887ec90 (diff)
fixed clean script to handle '|'s
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1416 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/training/clean-corpus-n.perl')
-rwxr-xr-x scripts/training/clean-corpus-n.perl 8
1 files changed, 6 insertions, 2 deletions
diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index c410f3c4a..5b2779931 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -61,6 +61,7 @@ binmode(EO, $binmode);
my $innr = 0;
my $outnr = 0;
+my $factored_flag;
while(my $f = <F>) {
$innr++;
print STDERR "." if $innr % 10000 == 0;
@@ -69,6 +70,9 @@ while(my $f = <F>) {
die "$corpus.$l2 is too short!" if !defined $e;
chomp($e);
chomp($f);
+ if ($innr == 1) {
+ $factored_flag = ($e =~ /\|/ || $f =~ /\|/);
+ }
#if lowercasing, lowercase
if ($lc) {
@@ -76,11 +80,11 @@ while(my $f = <F>) {
$f = lc($f);
}
- # $e =~ s/\|//g; # kinda hurts in factored input
+ $e =~ s/\|//g unless $factored_flag;
$e =~ s/\s+/ /g;
$e =~ s/^ //;
$e =~ s/ $//;
- # $f =~ s/\|//g; # kinda hurts in factored input
+ $f =~ s/\|//g unless $factored_flag;
$f =~ s/\s+/ /g;
$f =~ s/^ //;
$f =~ s/ $//;