Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/nplm.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'example/preprocess.pl')
-rwxr-xr-x example/preprocess.pl 16
1 files changed, 16 insertions, 0 deletions
diff --git a/example/preprocess.pl b/example/preprocess.pl
new file mode 100755
index 0000000..31432dd
--- /dev/null
+++ b/example/preprocess.pl
@@ -0,0 +1,16 @@
+#!/usr/bin/env perl
+
+while (my $line = <>) {
+ # Drop whitespace-only lines and headings that begin with "CANTO ".
+ next if ($line =~ /^\s*$/ || $line =~ /^CANTO /);
+ # Pad both ends with a space (the final split discards the padding again).
+ $line =~ s/^/ /;
+ $line =~ s/$/ /;
+ # Set each punctuation mark, and each run of hyphens, apart as its own token.
+ $line =~ s/([,.!?:;\(\)"]|-+)/ $1 /g;
+ # Fold ASCII capitals to lowercase.
+ $line =~ tr/A-Z/a-z/;
+ # Single quotes are not split off: they are too much trouble.
+ # Emit the tokens separated by exactly one space, one input line per line.
+ print(join(" ", split(" ", $line)) . "\n");
+}