Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hieu Hoang <hieuhoang@gmail.com> 2014-06-02 18:43:35 +0400
committer Hieu Hoang <hieuhoang@gmail.com> 2014-06-02 18:43:35 +0400
commit a134fb400034730ffda750c07bb9e73bc8b1c724 (patch)
tree 22ce40fe063d8bb4308c38553fd677fd5f7b3b3b /contrib/other-builds/extract-mixed-syntax
parent 934dd9b0ad1565b5587fcd2b84537cc146797a30 (diff)
add filter-by-source-word-count.perl
Diffstat (limited to 'contrib/other-builds/extract-mixed-syntax')
-rwxr-xr-x contrib/other-builds/extract-mixed-syntax/filter-by-source-word-count.perl 27
1 files changed, 27 insertions, 0 deletions
diff --git a/contrib/other-builds/extract-mixed-syntax/filter-by-source-word-count.perl b/contrib/other-builds/extract-mixed-syntax/filter-by-source-word-count.perl
new file mode 100755
index 000000000..d0e482a02
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/filter-by-source-word-count.perl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl
+# Usage: filter-by-source-word-count.perl MAX_WORDS < rules > filtered
+# Copies to STDOUT every STDIN line whose source side (the space-separated
+# tokens before the first "|||") has at most MAX_WORDS tokens. A line with
+# no "|||" is counted in full.
+
+use strict;
+use warnings;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+my $maxNumWords = $ARGV[0];
+die "Usage: $0 MAX_WORDS < in > out\n"
+  unless defined $maxNumWords && $maxNumWords =~ /^\d+$/;
+
+while (my $line = <STDIN>) {
+  chomp($line);
+  my @toks = split(/ /, $line);
+  # Bound the scan by the token count: without it a line lacking "|||"
+  # left $tok undef forever and the loop never terminated.
+  my $numSourceWords = 0;
+  ++$numSourceWords
+    while $numSourceWords < @toks && $toks[$numSourceWords] ne "|||";
+  print "$line\n" if $numSourceWords <= $maxNumWords;
+}