diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2018-11-11 01:41:52 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-11-11 01:41:52 +0300 |
commit | 19a31ca3f13cc370a87a4ee2f103d1e2cdf8d34d (patch) | |
tree | 4984d941c3bbfe17d220dadb6e19cc99191cfe68 | |
parent | a2315ffd3a7f2c4c35551581e450a2f9e9a20d7b (diff) | |
parent | 4133726ef9395ee2639ab54f0ab876fe2ab2875d (diff) |
Merge pull request #205 from coylz/master
Add option "-b" (unbuffer output) to tokenizer scripts
-rwxr-xr-x | scripts/tokenizer/deescape-special-chars-PTB.perl | 5 | ||||
-rwxr-xr-x | scripts/tokenizer/deescape-special-chars.perl | 5 | ||||
-rwxr-xr-x | scripts/tokenizer/delete-long-words.perl | 6 | ||||
-rwxr-xr-x | scripts/tokenizer/escape-special-chars.perl | 5 | ||||
-rwxr-xr-x | scripts/tokenizer/lowercase.perl | 5 | ||||
-rwxr-xr-x | scripts/tokenizer/remove-non-printing-char.perl | 5 | ||||
-rwxr-xr-x | scripts/tokenizer/replace-unicode-punctuation.perl | 5 |
7 files changed, 36 insertions, 0 deletions
diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl index ad2529b21..e5ffa2840 100755 --- a/scripts/tokenizer/deescape-special-chars-PTB.perl +++ b/scripts/tokenizer/deescape-special-chars-PTB.perl @@ -6,6 +6,11 @@ use warnings; use strict; +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) +} + while(<STDIN>) { s/\&bar;/\|/g; # factor separator (legacy) s/\&#124;/\|/g; # factor separator diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index b9d1ad74c..d4184cec4 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -6,6 +6,11 @@ use warnings; use strict; +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) +} + while(<STDIN>) { s/\&bar;/\|/g; # factor separator (legacy) s/\&#124;/\|/g; # factor separator diff --git a/scripts/tokenizer/delete-long-words.perl b/scripts/tokenizer/delete-long-words.perl index 331b601c0..ec6c8056d 100755 --- a/scripts/tokenizer/delete-long-words.perl +++ b/scripts/tokenizer/delete-long-words.perl @@ -1,6 +1,12 @@ #!/usr/bin/perl -w use strict; + +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) +} + while(<STDIN>) { chop; my $first = 1; diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index 143e85490..757572aaa 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -6,6 +6,11 @@ use warnings; use strict; +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) +} + while(<STDIN>) { chop; diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl index bc75e5e5c..cda6f2b62 100755 --- a/scripts/tokenizer/lowercase.perl +++ b/scripts/tokenizer/lowercase.perl @@ -6,6 +6,11 @@ use warnings; 
use strict; +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) +} + binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl index 92f6ade16..1a870f048 100755 --- a/scripts/tokenizer/remove-non-printing-char.perl +++ b/scripts/tokenizer/remove-non-printing-char.perl @@ -6,6 +6,11 @@ use warnings; use utf8; +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) +} + binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl index c2c7088d6..b0bc811fe 100755 --- a/scripts/tokenizer/replace-unicode-punctuation.perl +++ b/scripts/tokenizer/replace-unicode-punctuation.perl @@ -6,6 +6,11 @@ use warnings; use strict; +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) +} + #binmode(STDIN, ":utf8"); #binmode(STDOUT, ":utf8"); |