Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hieu Hoang <hieuhoang@gmail.com> 2018-11-11 01:41:52 +0300
committer GitHub <noreply@github.com> 2018-11-11 01:41:52 +0300
commit 19a31ca3f13cc370a87a4ee2f103d1e2cdf8d34d (patch)
tree 4984d941c3bbfe17d220dadb6e19cc99191cfe68
parent a2315ffd3a7f2c4c35551581e450a2f9e9a20d7b (diff)
parent 4133726ef9395ee2639ab54f0ab876fe2ab2875d (diff)
Merge pull request #205 from coylz/master
Add option "-b" (unbuffer output) to tokenizer scripts
-rwxr-xr-x scripts/tokenizer/deescape-special-chars-PTB.perl | 5
-rwxr-xr-x scripts/tokenizer/deescape-special-chars.perl | 5
-rwxr-xr-x scripts/tokenizer/delete-long-words.perl | 6
-rwxr-xr-x scripts/tokenizer/escape-special-chars.perl | 5
-rwxr-xr-x scripts/tokenizer/lowercase.perl | 5
-rwxr-xr-x scripts/tokenizer/remove-non-printing-char.perl | 5
-rwxr-xr-x scripts/tokenizer/replace-unicode-punctuation.perl | 5
7 files changed, 36 insertions, 0 deletions
diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl
index ad2529b21..e5ffa2840 100755
--- a/scripts/tokenizer/deescape-special-chars-PTB.perl
+++ b/scripts/tokenizer/deescape-special-chars-PTB.perl
@@ -6,6 +6,11 @@
use warnings;
use strict;
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
+}
+
while(<STDIN>) {
s/\&bar;/\|/g; # factor separator (legacy)
s/\&#124;/\|/g; # factor separator
diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl
index b9d1ad74c..d4184cec4 100755
--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@@ -6,6 +6,11 @@
use warnings;
use strict;
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
+}
+
while(<STDIN>) {
s/\&bar;/\|/g; # factor separator (legacy)
s/\&#124;/\|/g; # factor separator
diff --git a/scripts/tokenizer/delete-long-words.perl b/scripts/tokenizer/delete-long-words.perl
index 331b601c0..ec6c8056d 100755
--- a/scripts/tokenizer/delete-long-words.perl
+++ b/scripts/tokenizer/delete-long-words.perl
@@ -1,6 +1,12 @@
#!/usr/bin/perl -w
use strict;
+
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
+}
+
while(<STDIN>) {
chop;
my $first = 1;
diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl
index 143e85490..757572aaa 100755
--- a/scripts/tokenizer/escape-special-chars.perl
+++ b/scripts/tokenizer/escape-special-chars.perl
@@ -6,6 +6,11 @@
use warnings;
use strict;
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
+}
+
while(<STDIN>) {
chop;
diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl
index bc75e5e5c..cda6f2b62 100755
--- a/scripts/tokenizer/lowercase.perl
+++ b/scripts/tokenizer/lowercase.perl
@@ -6,6 +6,11 @@
use warnings;
use strict;
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
+}
+
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl
index 92f6ade16..1a870f048 100755
--- a/scripts/tokenizer/remove-non-printing-char.perl
+++ b/scripts/tokenizer/remove-non-printing-char.perl
@@ -6,6 +6,11 @@
use warnings;
use utf8;
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
+}
+
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl
index c2c7088d6..b0bc811fe 100755
--- a/scripts/tokenizer/replace-unicode-punctuation.perl
+++ b/scripts/tokenizer/replace-unicode-punctuation.perl
@@ -6,6 +6,11 @@
use warnings;
use strict;
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
+}
+
#binmode(STDIN, ":utf8");
#binmode(STDOUT, ":utf8");