Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: alvations <alvations@gmail.com> 2015-04-26 21:38:27 +0300
committer: alvations <alvations@gmail.com> 2015-04-26 21:38:27 +0300
commit: fa30ea671242fedecc65675bd4f5edbca59d5053 (patch)
tree: bb885fafa74390cb93ac2e414870845f012df8f9 /scripts
parent: 4a68c42b16626e2ee707e93a6453eda51dc807a1 (diff)
parent: ec54ea3c4fcdb055661dba1fe3003d6bb1a0bed8 (diff)
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder into moses-smt-master
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/OSM/OSM-Train.perl3
-rwxr-xr-xscripts/OSM/extract-singletons.perl3
-rwxr-xr-xscripts/OSM/flipAlignment.perl6
-rwxr-xr-xscripts/Transliteration/clean.pl3
-rwxr-xr-xscripts/Transliteration/corpusCreator.pl3
-rwxr-xr-xscripts/Transliteration/in-decoding-transliteration.pl3
-rwxr-xr-xscripts/Transliteration/post-decoding-transliteration.pl3
-rwxr-xr-xscripts/Transliteration/prepare-transliteration-phrase-table.pl3
-rwxr-xr-xscripts/Transliteration/threshold.pl3
-rwxr-xr-xscripts/Transliteration/train-transliteration-module.pl3
-rwxr-xr-xscripts/analysis/bootstrap-hypothesis-difference-significance.pl3
-rwxr-xr-xscripts/analysis/nontranslated_words.pl2
-rwxr-xr-xscripts/analysis/oov.pl2
-rwxr-xr-xscripts/analysis/sentence-by-sentence.pl3
-rwxr-xr-xscripts/analysis/sg2dot.perl3
-rwxr-xr-xscripts/analysis/show-phrases-used.pl4
-rwxr-xr-xscripts/analysis/smtgui/filter-phrase-table.pl3
-rwxr-xr-xscripts/analysis/suspicious_tokenization.pl2
-rwxr-xr-xscripts/analysis/weight-scan.pl2
-rw-r--r--scripts/ems/experiment.meta60
-rwxr-xr-xscripts/ems/experiment.perl18
-rwxr-xr-xscripts/ems/fix-info.perl3
-rwxr-xr-xscripts/ems/support/analysis.perl3
-rwxr-xr-xscripts/ems/support/build-domain-file-from-subcorpora.perl3
-rwxr-xr-xscripts/ems/support/build-sparse-features.perl3
-rwxr-xr-xscripts/ems/support/consolidate-training-data.perl3
-rwxr-xr-xscripts/ems/support/generic-multicore-parallelizer.perl3
-rwxr-xr-xscripts/ems/support/generic-parallelizer.perl3
-rwxr-xr-xscripts/ems/support/input-from-sgm.perl3
-rwxr-xr-xscripts/ems/support/interpolate-lm.perl3
-rwxr-xr-xscripts/ems/support/lmplz-wrapper.perl25
-rwxr-xr-xscripts/ems/support/mml-filter.perl3
-rwxr-xr-xscripts/ems/support/mml-score.perl3
-rwxr-xr-xscripts/ems/support/mml-train.perl3
-rwxr-xr-xscripts/ems/support/prepare-fast-align.perl5
-rwxr-xr-xscripts/ems/support/reference-from-sgm.perl3
-rwxr-xr-xscripts/ems/support/remove-segmentation-markup.perl3
-rwxr-xr-xscripts/ems/support/report-experiment-scores.perl3
-rwxr-xr-xscripts/ems/support/run-command-on-multiple-refsets.perl3
-rwxr-xr-xscripts/ems/support/run-wade.perl3
-rwxr-xr-xscripts/ems/support/split-sentences.perl3
-rwxr-xr-xscripts/ems/support/submit-grid.perl62
-rwxr-xr-xscripts/ems/support/substitute-filtered-tables-and-weights.perl3
-rwxr-xr-xscripts/ems/support/substitute-filtered-tables.perl4
-rwxr-xr-xscripts/ems/support/substitute-weights.perl4
-rwxr-xr-xscripts/ems/support/symmetrize-fast-align.perl3
-rwxr-xr-xscripts/ems/support/thot-lm-wrapper.perl3
-rwxr-xr-xscripts/ems/support/tree-converter-wrapper.perl2
-rwxr-xr-xscripts/ems/support/wrap-xml.perl3
-rw-r--r--scripts/ems/web/analysis.php8
-rw-r--r--scripts/ems/web/base64.js285
-rw-r--r--scripts/ems/web/bilingual-concordance.css1
-rw-r--r--scripts/ems/web/index.php2
-rw-r--r--scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc6
-rw-r--r--scripts/ems/web/overview.php5
-rwxr-xr-xscripts/ems/web/progress.perl3
-rwxr-xr-xscripts/fuzzy-match/create_xml.perl3
-rwxr-xr-xscripts/generic/compound-splitter.perl3
-rwxr-xr-xscripts/generic/extract-factors.pl3
-rwxr-xr-xscripts/generic/extract-parallel.perl7
-rwxr-xr-xscripts/generic/fsa2fsal.pl3
-rwxr-xr-xscripts/generic/fsa2plf.pl3
-rwxr-xr-xscripts/generic/fsal2fsa.pl3
-rwxr-xr-xscripts/generic/generic-parallel.perl3
-rwxr-xr-xscripts/generic/giza-parallel.perl3
-rwxr-xr-xscripts/generic/lopar2pos.pl4
-rwxr-xr-xscripts/generic/moses-parallel.pl3
-rwxr-xr-xscripts/generic/mteval-v12.pl3
-rwxr-xr-xscripts/generic/mteval-v13a.pl2
-rwxr-xr-xscripts/generic/multi-bleu.perl3
-rwxr-xr-xscripts/generic/ph_numbers.perl4
-rwxr-xr-xscripts/generic/qsub-wrapper.pl3
-rwxr-xr-xscripts/generic/reverse-alignment.perl3
-rwxr-xr-xscripts/generic/score-parallel.perl7
-rwxr-xr-xscripts/generic/strip-xml.perl3
-rwxr-xr-xscripts/generic/trainlm-irst2.perl3
-rwxr-xr-xscripts/generic/trainlm-lmplz.perl40
-rwxr-xr-xscripts/other/beautify.perl3
-rwxr-xr-xscripts/other/convert-pt.perl2
-rwxr-xr-xscripts/other/delete-scores.perl3
-rwxr-xr-xscripts/other/get_many_translations_from_google.perl4
-rwxr-xr-xscripts/other/retain-lines.perl2
-rwxr-xr-xscripts/other/translate_by_microsoft_bing.perl3
-rwxr-xr-xscripts/recaser/detruecase.perl3
-rwxr-xr-xscripts/recaser/recase.perl3
-rwxr-xr-xscripts/recaser/train-recaser.perl3
-rwxr-xr-xscripts/recaser/train-truecaser.perl3
-rwxr-xr-xscripts/recaser/truecase.perl4
-rwxr-xr-xscripts/regression-testing/compare-results.pl3
-rwxr-xr-xscripts/regression-testing/create_localized_moses_ini.pl3
-rwxr-xr-xscripts/regression-testing/modify-pars.pl3
-rwxr-xr-xscripts/regression-testing/moses-virtual.pl3
-rwxr-xr-xscripts/regression-testing/run-single-test.pl3
-rwxr-xr-xscripts/regression-testing/run-test-suite.pl3
-rwxr-xr-xscripts/tokenizer/deescape-special-chars-PTB.perl3
-rwxr-xr-xscripts/tokenizer/deescape-special-chars.perl3
-rwxr-xr-xscripts/tokenizer/detokenizer.perl11
-rwxr-xr-xscripts/tokenizer/escape-special-chars.perl3
-rwxr-xr-xscripts/tokenizer/lowercase.perl3
-rwxr-xr-xscripts/tokenizer/normalize-punctuation.perl3
-rwxr-xr-xscripts/tokenizer/pre-tok-clean.perl46
-rwxr-xr-xscripts/tokenizer/pre-tokenizer.perl4
-rw-r--r--scripts/tokenizer/pre_tokenize_cleaning.py78
-rwxr-xr-xscripts/tokenizer/remove-non-printing-char.perl3
-rwxr-xr-xscripts/tokenizer/replace-unicode-punctuation.perl3
-rwxr-xr-xscripts/tokenizer/tokenizer.perl10
-rwxr-xr-xscripts/tokenizer/tokenizer_PTB.perl3
-rwxr-xr-xscripts/training/absolutize_moses_model.pl4
-rwxr-xr-xscripts/training/analyse_moses_model.pl2
-rwxr-xr-xscripts/training/bilingual-lm/extract_training.py2
-rwxr-xr-xscripts/training/binarize-model.perl3
-rwxr-xr-xscripts/training/build-generation-table.perl3
-rwxr-xr-xscripts/training/build-mmsapt.perl3
-rwxr-xr-xscripts/training/clean-corpus-n.perl9
-rwxr-xr-xscripts/training/clone_moses_model.pl3
-rwxr-xr-xscripts/training/combine_factors.pl2
-rwxr-xr-xscripts/training/convert-moses-ini-to-v2.perl3
-rwxr-xr-xscripts/training/corpus-sizes.perl3
-rwxr-xr-xscripts/training/exodus.perl3
-rwxr-xr-xscripts/training/get-lexical.perl3
-rwxr-xr-xscripts/training/giza2bal.pl6
-rwxr-xr-xscripts/training/mert-moses.pl3
-rwxr-xr-xscripts/training/postprocess-lopar.perl3
-rwxr-xr-xscripts/training/reduce-factors.perl81
-rwxr-xr-xscripts/training/reduce-topt-count.pl2
-rwxr-xr-xscripts/training/reduce_combine.pl2
-rwxr-xr-xscripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl3
-rwxr-xr-xscripts/training/strip-xml.perl17
-rwxr-xr-xscripts/training/threshold-filter.perl3
-rwxr-xr-xscripts/training/train-global-lexicon-model.perl3
-rwxr-xr-xscripts/training/train-model.perl7
-rwxr-xr-xscripts/training/wrappers/berkeleyparsed2mosesxml.perl3
-rwxr-xr-xscripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl3
-rwxr-xr-xscripts/training/wrappers/conll2mosesxml.py2
-rwxr-xr-xscripts/training/wrappers/filter-excluded-lines.perl3
-rwxr-xr-xscripts/training/wrappers/find-unparseable.perl3
-rwxr-xr-xscripts/training/wrappers/mada-wrapper.perl3
-rwxr-xr-xscripts/training/wrappers/madamira-wrapper.perl93
-rwxr-xr-xscripts/training/wrappers/make-factor-brown-cluster-mkcls.perl3
-rwxr-xr-xscripts/training/wrappers/make-factor-de-morph.perl3
-rwxr-xr-xscripts/training/wrappers/make-factor-de-pos.perl3
-rwxr-xr-xscripts/training/wrappers/make-factor-en-pos.mxpost.perl3
-rwxr-xr-xscripts/training/wrappers/make-factor-pos.tree-tagger.perl3
-rwxr-xr-xscripts/training/wrappers/make-factor-stem.perl3
-rwxr-xr-xscripts/training/wrappers/make-factor-suffix.perl3
-rwxr-xr-xscripts/training/wrappers/mosesxml2berkeleyparsed.perl3
-rwxr-xr-xscripts/training/wrappers/mosesxml2brackets.py51
-rwxr-xr-xscripts/training/wrappers/parse-de-berkeley.perl3
-rwxr-xr-xscripts/training/wrappers/parse-de-bitpar.perl3
-rwxr-xr-xscripts/training/wrappers/parse-en-collins.perl3
-rwxr-xr-xscripts/training/wrappers/parse-en-egret.perl3
-rwxr-xr-xscripts/training/wrappers/syntax-hyphen-splitting.perl3
-rwxr-xr-xscripts/training/wrappers/tagger-german-chunk.perl3
153 files changed, 961 insertions, 361 deletions
diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl
index ae5a386fa..e2b604f0b 100755
--- a/scripts/OSM/OSM-Train.perl
+++ b/scripts/OSM/OSM-Train.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/OSM/extract-singletons.perl b/scripts/OSM/extract-singletons.perl
index 33f857929..83719502f 100755
--- a/scripts/OSM/extract-singletons.perl
+++ b/scripts/OSM/extract-singletons.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use Getopt::Std;
getopts('q');
diff --git a/scripts/OSM/flipAlignment.perl b/scripts/OSM/flipAlignment.perl
index e738802b1..3559bf79b 100755
--- a/scripts/OSM/flipAlignment.perl
+++ b/scripts/OSM/flipAlignment.perl
@@ -1,5 +1,7 @@
-#! /usr/bin/perl
- use strict;
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
my $file = shift(@ARGV);
open(MYFILE, $file);
diff --git a/scripts/Transliteration/clean.pl b/scripts/Transliteration/clean.pl
index 41a55c4eb..c59bf0798 100755
--- a/scripts/Transliteration/clean.pl
+++ b/scripts/Transliteration/clean.pl
@@ -1,6 +1,7 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
#input hindi word urdu word, delete all those entries that have number on any side
+use warnings;
use utf8;
use Getopt::Std;
diff --git a/scripts/Transliteration/corpusCreator.pl b/scripts/Transliteration/corpusCreator.pl
index 8634d23dd..d2df8323c 100755
--- a/scripts/Transliteration/corpusCreator.pl
+++ b/scripts/Transliteration/corpusCreator.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl
index ebf1c490b..216d99a3e 100755
--- a/scripts/Transliteration/in-decoding-transliteration.pl
+++ b/scripts/Transliteration/in-decoding-transliteration.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl
index 578160ba2..201f40d97 100755
--- a/scripts/Transliteration/post-decoding-transliteration.pl
+++ b/scripts/Transliteration/post-decoding-transliteration.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
index dfd1ed4de..4fc03b526 100755
--- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl
+++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/Transliteration/threshold.pl b/scripts/Transliteration/threshold.pl
index 9b34bd12c..8e3704fd6 100755
--- a/scripts/Transliteration/threshold.pl
+++ b/scripts/Transliteration/threshold.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use utf8;
require Encode;
use IO::Handle;
diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl
index ed7f32097..05804afb6 100755
--- a/scripts/Transliteration/train-transliteration-module.pl
+++ b/scripts/Transliteration/train-transliteration-module.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use utf8;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
index 50492cad0..149676b6f 100755
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
use utf8;
###############################################
@@ -14,6 +14,7 @@ use utf8;
# 23.01.2010: added NIST p-value and interval computation
###############################################
+use warnings;
use strict;
#constants
diff --git a/scripts/analysis/nontranslated_words.pl b/scripts/analysis/nontranslated_words.pl
index 8fd3c4fbc..b5639429b 100755
--- a/scripts/analysis/nontranslated_words.pl
+++ b/scripts/analysis/nontranslated_words.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# $Id$
# Reads a source and hypothesis file and counts equal tokens. Some of these
diff --git a/scripts/analysis/oov.pl b/scripts/analysis/oov.pl
index 15261c410..c5d6f92e3 100755
--- a/scripts/analysis/oov.pl
+++ b/scripts/analysis/oov.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# Display OOV rate of a test set against a training corpus or a phrase table.
# Ondrej Bojar
diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl
index 82ae57949..4f6560a56 100755
--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@@ -1,9 +1,10 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
#sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
#usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html
+use warnings;
use strict;
use Getopt::Long;
diff --git a/scripts/analysis/sg2dot.perl b/scripts/analysis/sg2dot.perl
index f6a5dff49..b17dfd9fb 100755
--- a/scripts/analysis/sg2dot.perl
+++ b/scripts/analysis/sg2dot.perl
@@ -1,9 +1,10 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
#
# Author : Loic BARRAULT
# Script to convert MOSES searchgraph to DOT format
#
+use warnings;
use strict;
use File::Path;
use File::Basename;
diff --git a/scripts/analysis/show-phrases-used.pl b/scripts/analysis/show-phrases-used.pl
index 5fedf73f1..0a719d207 100755
--- a/scripts/analysis/show-phrases-used.pl
+++ b/scripts/analysis/show-phrases-used.pl
@@ -1,11 +1,13 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
#show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used
#usage: show-phrases-used DECODER_OUTFILE > output.html
# where DECODER_OUTFILE is the output of moses with the -T (show alignments) option
+use warnings;
use strict;
+
BEGIN
{
my $wd= `pawd 2>/dev/null`;
diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl
index db51da63d..9f411f3fa 100755
--- a/scripts/analysis/smtgui/filter-phrase-table.pl
+++ b/scripts/analysis/smtgui/filter-phrase-table.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
#by Philipp Koehn, de-augmented by Evan Herbst
@@ -9,6 +9,7 @@
#similar function to filter-model-given-input.pl, but only operates
#on the phrase table and doesn't require that any subdirectories exist
+use warnings;
use strict;
my $MAX_LENGTH = 10;
diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl
index 29e32d271..d1e5c1f67 100755
--- a/scripts/analysis/suspicious_tokenization.pl
+++ b/scripts/analysis/suspicious_tokenization.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# Collects and prints all n-grams that appear in the given corpus both
# tokenized as well as untokenized.
# Ondrej Bojar
diff --git a/scripts/analysis/weight-scan.pl b/scripts/analysis/weight-scan.pl
index 6789c4d6d..7283483e9 100755
--- a/scripts/analysis/weight-scan.pl
+++ b/scripts/analysis/weight-scan.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# runs Moses many times changing the values of one weight, all others fixed
# nbest lists are always produced to allow for comparison of real and
# 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index f9a400eef..57ef4f9d6 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -7,8 +7,15 @@ get-corpus
default-name: corpus/txt
rerun-on-change: input-extension output-extension
template: IN OUT $input-extension $output-extension
+pre-tok-clean
+ in: raw-stem
+ out: pre-tok-cleaned
+ default-name: corpus/pre-tok-cleaned
+ pass-unless: pre-tok-clean
+ template: $pre-tok-clean IN $input-extension $output-extension OUT OUT.lines-retained
+ parallelizable: yes
tokenize
- in: raw-stem
+ in: pre-tok-cleaned
out: tokenized-stem
default-name: corpus/tok
pass-unless: input-tokenizer output-tokenizer
@@ -158,11 +165,18 @@ get-corpus
pass-unless: get-corpus-script
default-name: lm/txt
template: $get-corpus-script > OUT
+use-parallel-corpus
+ in: parallel-corpus-stem
+ out: tokenized-corpus
+ default-name: lm/tok
+ ignore-unless: parallel-corpus-stem
+ template: ln -s IN.$output-extension OUT
tokenize
in: raw-corpus
out: tokenized-corpus
default-name: lm/tok
pass-unless: output-tokenizer
+ ignore-if: parallel-corpus-stem
template: $output-tokenizer < IN > OUT
parallelizable: yes
mock-parse
@@ -185,7 +199,7 @@ lowercase
default-name: lm/lowercased
pass-unless: output-lowercaser
ignore-if: output-truecaser
- only-factor-0: yes
+ #only-factor-0: yes
template: $output-lowercaser < IN > OUT
parallelizable: yes
truecase
@@ -204,8 +218,14 @@ split
default-name: lm/split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
+strip
+ in: split-corpus
+ out: stripped-corpus
+ default-name: lm/stripped
+ pass-unless: mock-output-parser-lm
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT
train
- in: split-corpus
+ in: stripped-corpus
out: lm
default-name: lm/lm
ignore-if: rlm-training
@@ -220,7 +240,7 @@ randomize
pass-unless: lm-randomizer
ignore-if: rlm-training
train-randomized
- in: split-corpus
+ in: stripped-corpus
out: rlm
default-name: lm/rlm
ignore-unless: rlm-training
@@ -940,19 +960,34 @@ truecase-reference-devtest
template: $output-truecaser -model IN1.$output-extension < IN > OUT
split-reference
in: truecased-reference SPLITTER:splitter-model
- out: reference
+ out: split-ref
default-name: tuning/reference.split
pass-unless: output-splitter
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
split-reference-devtest
in: truecased-reference-devtest SPLITTER:splitter-model
- out: reference-devtest
+ out: split-ref-devtest
default-name: tuning/reference.devtest.split
pass-unless: output-splitter
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
+strip-reference
+ in: split-ref
+ out: reference
+ default-name: tuning/reference.stripped
+ pass-unless: mock-output-parser-references
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+strip-reference-devtest
+ in: split-ref-devtest
+ out: reference
+ default-name: tuning/reference.devtest.stripped
+ pass-unless: mock-output-parser-references
+ ignore-unless: use-mira
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
filter
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
out: filtered-dir
@@ -1203,12 +1238,19 @@ mock-parse-reference
template: $mock-output-parser-references < IN > OUT
lowercase-reference
in: mock-parsed-reference
- out: reference
- default-name: evaluation/reference
+ out: lowercased-reference
+ default-name: evaluation/reference.lowercased
pass-unless: output-lowercaser
- pass-if: recaser
+ pass-if: recaser
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
+strip-reference
+ in: lowercased-reference
+ out: reference
+ default-name: evaluation/reference
+ pass-unless: mock-output-parser-references
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
wade
in: filtered-dir truecased-input tokenized-reference alignment system-output
out: wade-analysis
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 7a5e81eec..7070a7c9e 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -1,8 +1,9 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# Experiment Management System
# Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
@@ -18,7 +19,18 @@ sub trim($)
my $host = `hostname`; chop($host);
print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;
-my ($CONFIG_FILE,$EXECUTE,$NO_GRAPH,$CONTINUE,$FINAL_STEP,$FINAL_OUT,$VERBOSE,$IGNORE_TIME,$DELETE_CRASHED,$DELETE_VERSION);
+my ($CONFIG_FILE,
+ $EXECUTE,
+ $NO_GRAPH,
+ $CONTINUE,
+ $FINAL_STEP,
+ $FINAL_OUT,
+ $VERBOSE,
+ $IGNORE_TIME,
+ $DELETE_CRASHED,
+ $DELETE_VERSION
+ );
+
my $SLEEP = 2;
my $META = "$RealBin/experiment.meta";
@@ -3442,7 +3454,7 @@ sub create_step {
$subdir = "lm" if $subdir eq "interpolated-lm";
open(STEP,">$file") or die "Cannot open: $!";
print STEP "#!/bin/bash\n\n";
- print STEP "PATH=\"".$ENV{"PATH"}."\"\n";
+ print STEP "PATH=\"".$ENV{"PATH"}."\"\n";
print STEP "cd $dir\n";
print STEP "echo 'starting at '`date`' on '`hostname`\n";
print STEP "mkdir -p $dir/$subdir\n\n";
diff --git a/scripts/ems/fix-info.perl b/scripts/ems/fix-info.perl
index 924a1a990..8f83d4ccf 100755
--- a/scripts/ems/fix-info.perl
+++ b/scripts/ems/fix-info.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($file,$step) = @ARGV;
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index be5b76a5e..cea2657c9 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/ems/support/build-domain-file-from-subcorpora.perl b/scripts/ems/support/build-domain-file-from-subcorpora.perl
index e85b6ad84..f166c8927 100755
--- a/scripts/ems/support/build-domain-file-from-subcorpora.perl
+++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
# Create domain file from corpora
diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl
index 722f02701..5d9b786ad 100755
--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
# Build necessary files for sparse lexical features
diff --git a/scripts/ems/support/consolidate-training-data.perl b/scripts/ems/support/consolidate-training-data.perl
index 7ee0652d2..170ba999c 100755
--- a/scripts/ems/support/consolidate-training-data.perl
+++ b/scripts/ems/support/consolidate-training-data.perl
@@ -1,7 +1,8 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $
+use warnings;
use strict;
my ($in,$out,$consolidated,@PART) = @ARGV;
diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl
index d7e030ad2..e5a12adce 100755
--- a/scripts/ems/support/generic-multicore-parallelizer.perl
+++ b/scripts/ems/support/generic-multicore-parallelizer.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my $cores = 8;
diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl
index fa2d778a2..0b248be7e 100755
--- a/scripts/ems/support/generic-parallelizer.perl
+++ b/scripts/ems/support/generic-parallelizer.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my $jobs = 20;
diff --git a/scripts/ems/support/input-from-sgm.perl b/scripts/ems/support/input-from-sgm.perl
index de888a6f3..223996676 100755
--- a/scripts/ems/support/input-from-sgm.perl
+++ b/scripts/ems/support/input-from-sgm.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt")
diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl
index 8380f26ca..a2fe62b22 100755
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use IPC::Open3;
use File::Temp qw/tempdir/;
diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl
index 252c32c37..eadca6263 100755
--- a/scripts/ems/support/lmplz-wrapper.perl
+++ b/scripts/ems/support/lmplz-wrapper.perl
@@ -1,27 +1,26 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
-my ($TEXT,$ORDER,$PRUNE,$BIN,$LM,$MEMORY,$TMP,$DISCOUNT_FALLBACK);
+Getopt::Long::config("no_auto_abbrev");
+Getopt::Long::config("pass_through");
+
+
+my ($TEXT,$ORDER,$BIN,$LM);
&GetOptions('text=s' => \$TEXT,
'lm=s' => \$LM,
'bin=s' => \$BIN,
- 'prune=s' => \$PRUNE,
- 'discount_fallback' => \$DISCOUNT_FALLBACK,
- 'T=s' => \$TMP,
- 'S=s' => \$MEMORY,
'order=i' => \$ORDER);
-die("ERROR: specify at least --text CORPUS --arpa LM and --order N!")
- unless defined($TEXT) && defined($LM) && defined($ORDER);
+die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!")
+ unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER);
-my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM";
-$cmd .= " --prune $PRUNE" if defined($PRUNE);
-$cmd .= " -S $MEMORY" if defined($MEMORY);
-$cmd .= " -T $TMP" if defined($TMP);
-$cmd .= " --discount_fallback" if defined($DISCOUNT_FALLBACK);
+my $settings = join(' ', @ARGV);
+#print STDERR "settngs=$settings \n";
+my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings";
print "exec: $cmd\n";
`$cmd`;
diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl
index f46b132a3..c50725aae 100755
--- a/scripts/ems/support/mml-filter.perl
+++ b/scripts/ems/support/mml-filter.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use FindBin qw($RealBin);
diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl
index 86ae867f1..449d6a05c 100755
--- a/scripts/ems/support/mml-score.perl
+++ b/scripts/ems/support/mml-score.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
#
diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl
index f68e0163f..1f0548082 100755
--- a/scripts/ems/support/mml-train.perl
+++ b/scripts/ems/support/mml-train.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($indomain_source,,$indomain_target,$outdomain_source,$outdomain_target,$lm_training,$lm_binarizer,$order,$lm_settings,$line_count,$model);
diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl
index 1d95ea972..54c124af0 100755
--- a/scripts/ems/support/prepare-fast-align.perl
+++ b/scripts/ems/support/prepare-fast-align.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($source_file,$target_file,$alignment_factors) = @ARGV;
@@ -22,7 +23,7 @@ while(my $source = <SOURCE>) {
# remove markup
foreach my $line (\$source,\$target) {
- $$line =~ s/\<[^\>]+\>//g;
+ $$line =~ s/\<[^\>]+\>/ /g;
$$line =~ s/\s+/ /g;
$$line =~ s/^ //;
$$line =~ s/ $//;
diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl
index c504c3e75..595226bf1 100755
--- a/scripts/ems/support/reference-from-sgm.perl
+++ b/scripts/ems/support/reference-from-sgm.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
die("ERROR syntax: reference-from-sgm.perl ref src out")
diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl
index 18918c905..d6333f813 100755
--- a/scripts/ems/support/remove-segmentation-markup.perl
+++ b/scripts/ems/support/remove-segmentation-markup.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
$|++;
diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl
index 67963834a..2e433f291 100755
--- a/scripts/ems/support/report-experiment-scores.perl
+++ b/scripts/ems/support/report-experiment-scores.perl
@@ -1,7 +1,8 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $
+use warnings;
use strict;
my $email;
diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl
index 972f5602d..c3db3c4dc 100755
--- a/scripts/ems/support/run-command-on-multiple-refsets.perl
+++ b/scripts/ems/support/run-command-on-multiple-refsets.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
die("ERROR: syntax: run-command-on-multiple-refsets.perl cmd in out")
diff --git a/scripts/ems/support/run-wade.perl b/scripts/ems/support/run-wade.perl
index 418ff8c97..25cda3bb3 100755
--- a/scripts/ems/support/run-wade.perl
+++ b/scripts/ems/support/run-wade.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use strict;
use File::Temp qw/ tempfile tempdir /;
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index cf7174484..f1af451b3 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# Based on Preprocessor written by Philipp Koehn
@@ -6,6 +6,7 @@ binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
+use warnings;
use FindBin qw($RealBin);
use strict;
diff --git a/scripts/ems/support/submit-grid.perl b/scripts/ems/support/submit-grid.perl
new file mode 100755
index 000000000..9997241e7
--- /dev/null
+++ b/scripts/ems/support/submit-grid.perl
@@ -0,0 +1,62 @@
+#!/usr/bin/env perl
+
+# Write a PBS job script that runs experiment.perl, then submit it with qsub.
+# Options: --config FILE  passed to experiment.perl as -config=FILE
+#          --continue N   passed to experiment.perl as -continue=N (wins over --config)
+#          --args "..."   extra arguments inserted into the qsub command line
+
+use warnings;
+use strict;
+use Cwd;
+use FindBin qw($RealBin);
+use Getopt::Long;
+use File::Basename;
+
+my $continue = 0;
+my $args = "";
+my $config;
+
+GetOptions("continue=i" => \$continue,
+           "args=s" => \$args,
+           "config=s" => \$config
+    ) or exit 1;
+die "ERROR: specify --config or --continue\n" unless $continue || defined $config;
+
+# create temp run file
+my $gridDir = cwd() ."/grid";
+mkdir $gridDir; # may already exist; any real failure surfaces at open() below
+
+my $runPath = "$gridDir/run.$$";
+print STDERR "runPath=$runPath\n";
+
+open (my $runFile, ">", $runPath) or die "ERROR: cannot write $runPath: $!\n";
+
+print $runFile "#!/bin/bash\n";
+print $runFile "#PBS -d" .cwd() ."\n\n";
+
+my $path = $ENV{"PATH"};
+my $user = $ENV{"USER"};
+
+print $runFile "export PATH=\"$path\"\n\n";
+print $runFile "export PERL5LIB=\"/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1:/home/$user/perl5/lib/perl5\"\n\n";
+
+print $runFile "module load NYUAD/2.0 \n";
+print $runFile "module load gcc python/2.7.9 boost cmake zlib jdk perl expat \n\n";
+
+# experiment.perl sits in the parent of this script's directory
+my $emsDir = dirname($RealBin);
+
+if ($continue) {
+    print $runFile "nice ionice -c 3 $emsDir/experiment.perl -exec -continue=$continue \n\n";
+}
+else {
+    print $runFile "nice ionice -c 3 $emsDir/experiment.perl -exec -config=$config \n\n";
+}
+
+close $runFile or die "ERROR: cannot finish writing $runPath: $!\n";
+
+my $cmd = "qsub $args $runPath";
+`$cmd`;
+
+# unlink takes file names, not handles: remove the script by its path
+unlink $runPath or warn "WARNING: could not remove $runPath: $!\n";
diff --git a/scripts/ems/support/substitute-filtered-tables-and-weights.perl b/scripts/ems/support/substitute-filtered-tables-and-weights.perl
index 3a135b44e..681d251c7 100755
--- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl
+++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl
index be1509b8f..e7d9f55f8 100755
--- a/scripts/ems/support/substitute-filtered-tables.perl
+++ b/scripts/ems/support/substitute-filtered-tables.perl
@@ -1,4 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+
+use warnings;
# experiment.perl support script
# get filtered rule and reordering tables and place them into a configuration file
diff --git a/scripts/ems/support/substitute-weights.perl b/scripts/ems/support/substitute-weights.perl
index 9a72dec8a..42357ed1e 100755
--- a/scripts/ems/support/substitute-weights.perl
+++ b/scripts/ems/support/substitute-weights.perl
@@ -1,4 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+
+use warnings;
# experiment.perl support script
# get filtered rule and reordering tables and place them into a configuration file
diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl
index 40583ee15..90621dea9 100755
--- a/scripts/ems/support/symmetrize-fast-align.perl
+++ b/scripts/ems/support/symmetrize-fast-align.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
die("ERROR: syntax is fastalign2bal.perl direct-alignment inverse-alignment source-file target-file out-stem symmetrization-method symal\n") unless scalar(@ARGV) == 7;
diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl
index e6f7839f1..222623c5b 100755
--- a/scripts/ems/support/thot-lm-wrapper.perl
+++ b/scripts/ems/support/thot-lm-wrapper.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/ems/support/tree-converter-wrapper.perl b/scripts/ems/support/tree-converter-wrapper.perl
index aae55991a..a37654cf1 100755
--- a/scripts/ems/support/tree-converter-wrapper.perl
+++ b/scripts/ems/support/tree-converter-wrapper.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl
use warnings;
use strict;
diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl
index beeca6cdd..28708a62a 100755
--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($language,$src,$system) = @ARGV;
diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php
index a64d5977f..00bb9e15f 100644
--- a/scripts/ems/web/analysis.php
+++ b/scripts/ems/web/analysis.php
@@ -1261,8 +1261,8 @@ function input_annotation($sentence,$input,$segmentation,$filter) {
for($j=$from;$j<=$to;$j++) {
if ($j>$from) { $phrase .= " "; }
$phrase .= $word[$j];
- $highlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='#ffff80';";
- $lowlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
+ $highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
+ $lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
}
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($phrase)."');\"":"").">";
}
@@ -1443,10 +1443,10 @@ function biconcor($query) {
$sentence = $_GET['sentence'];
$biconcor = get_biconcor_version($dir,$set,$id);
print "<center>
-<form method=get id=\"BiconcorForm\">
+<form method=\"get\" id=\"BiconcorForm\" onsubmit=\"return false;\">
<img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\">
<input width=20 id=\"BiconcorQuery\" value=\"$query\">
-<input type=submit onclick=\"show_biconcor($sentence,encodeBase64(document.getElementById('BiconcorQuery').value));\" value=\"look up\">
+<input type=submit onclick=\"show_biconcor($sentence,Base64.encode(document.getElementById('BiconcorQuery').value));\" value=\"look up\">
</form>
<div class=\"biconcor-content\">";
$cmd = "./biconcor -html -l $dir/model/biconcor.$biconcor -Q ".base64_encode($query)." 2>/dev/null";
diff --git a/scripts/ems/web/base64.js b/scripts/ems/web/base64.js
index e0e94d765..67fd9ad8d 100644
--- a/scripts/ems/web/base64.js
+++ b/scripts/ems/web/base64.js
@@ -1,108 +1,193 @@
-var END_OF_INPUT = -1;
+/*
+ * $Id: base64.js,v 2.15 2014/04/05 12:58:57 dankogai Exp dankogai $
+ *
+ * Licensed under the MIT license.
+ * http://opensource.org/licenses/mit-license
+ *
+ * References:
+ * http://en.wikipedia.org/wiki/Base64
+ */
-var base64Chars = new Array(
- 'A','B','C','D','E','F','G','H',
- 'I','J','K','L','M','N','O','P',
- 'Q','R','S','T','U','V','W','X',
- 'Y','Z','a','b','c','d','e','f',
- 'g','h','i','j','k','l','m','n',
- 'o','p','q','r','s','t','u','v',
- 'w','x','y','z','0','1','2','3',
- '4','5','6','7','8','9','+','/'
-);
-
-var reverseBase64Chars = new Array();
-for (var i=0; i < base64Chars.length; i++){
- reverseBase64Chars[base64Chars[i]] = i;
-}
-
-var base64Str;
-var base64Count;
-function setBase64Str(str){
- base64Str = str;
- base64Count = 0;
-}
-function readBase64(){
- if (!base64Str) return END_OF_INPUT;
- if (base64Count >= base64Str.length) return END_OF_INPUT;
- var c = base64Str.charCodeAt(base64Count) & 0xff;
- base64Count++;
- return c;
-}
-function encodeBase64(str){
- setBase64Str(str);
- var result = '';
- var inBuffer = new Array(3);
- var lineCount = 0;
- var done = false;
- while (!done && (inBuffer[0] = readBase64()) != END_OF_INPUT){
- inBuffer[1] = readBase64();
- inBuffer[2] = readBase64();
- result += (base64Chars[ inBuffer[0] >> 2 ]);
- if (inBuffer[1] != END_OF_INPUT){
- result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30) | (inBuffer[1] >> 4) ]);
- if (inBuffer[2] != END_OF_INPUT){
- result += (base64Chars [((inBuffer[1] << 2) & 0x3c) | (inBuffer[2] >> 6) ]);
- result += (base64Chars [inBuffer[2] & 0x3F]);
- } else {
- result += (base64Chars [((inBuffer[1] << 2) & 0x3c)]);
- result += ('=');
- done = true;
- }
+(function(global) {
+ 'use strict';
+ // existing version for noConflict()
+ var _Base64 = global.Base64;
+ var version = "2.1.7";
+ // if node.js, we use Buffer
+ var buffer;
+ if (typeof module !== 'undefined' && module.exports) {
+ buffer = require('buffer').Buffer;
+ }
+ // constants
+ var b64chars
+ = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
+ var b64tab = function(bin) {
+ var t = {};
+ for (var i = 0, l = bin.length; i < l; i++) t[bin.charAt(i)] = i;
+ return t;
+ }(b64chars);
+ var fromCharCode = String.fromCharCode;
+ // encoder stuff
+ var cb_utob = function(c) {
+ if (c.length < 2) {
+ var cc = c.charCodeAt(0);
+ return cc < 0x80 ? c
+ : cc < 0x800 ? (fromCharCode(0xc0 | (cc >>> 6))
+ + fromCharCode(0x80 | (cc & 0x3f)))
+ : (fromCharCode(0xe0 | ((cc >>> 12) & 0x0f))
+ + fromCharCode(0x80 | ((cc >>> 6) & 0x3f))
+ + fromCharCode(0x80 | ( cc & 0x3f)));
} else {
- result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30)]);
- result += ('=');
- result += ('=');
- done = true;
- }
- lineCount += 4;
- if (lineCount >= 76){
- result += ('\n');
- lineCount = 0;
+ var cc = 0x10000
+ + (c.charCodeAt(0) - 0xD800) * 0x400
+ + (c.charCodeAt(1) - 0xDC00);
+ return (fromCharCode(0xf0 | ((cc >>> 18) & 0x07))
+ + fromCharCode(0x80 | ((cc >>> 12) & 0x3f))
+ + fromCharCode(0x80 | ((cc >>> 6) & 0x3f))
+ + fromCharCode(0x80 | ( cc & 0x3f)));
}
+ };
+ var re_utob = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\x00-\x7F]/g; // a surrogate pair, or any single non-ASCII code unit
+ var utob = function(u) { // rewrite every non-ASCII match in u via cb_utob (UTF-8 byte values as chars)
+ return u.replace(re_utob, cb_utob);
+ };
+ var cb_encode = function(ccc) { // base64-encode one chunk of 1-3 chars into 4 output chars
+ var padlen = [0, 2, 1][ccc.length % 3], // count of '=' pad chars: 3 chars -> 0, 1 char -> 2, 2 chars -> 1
+ ord = ccc.charCodeAt(0) << 16 // pack the char codes into one integer, first char highest
+ | ((ccc.length > 1 ? ccc.charCodeAt(1) : 0) << 8) // absent chars contribute 0
+ | ((ccc.length > 2 ? ccc.charCodeAt(2) : 0)),
+ chars = [
+ b64chars.charAt( ord >>> 18), // unmasked; presumes each char code fits in one byte - TODO confirm callers
+ b64chars.charAt((ord >>> 12) & 63),
+ padlen >= 2 ? '=' : b64chars.charAt((ord >>> 6) & 63), // padded when the chunk had only 1 char
+ padlen >= 1 ? '=' : b64chars.charAt(ord & 63) // padded when the chunk had 1 or 2 chars
+ ];
+ return chars.join('');
+ };
+ var btoa = global.btoa ? function(b) { // use the host's btoa when the global object provides one
+ return global.btoa(b);
+ } : function(b) { // otherwise encode b in runs of up to 3 chars
+ return b.replace(/[\s\S]{1,3}/g, cb_encode);
+ };
+ var _encode = buffer ? function (u) {
+ return (u.constructor === buffer.constructor ? u : new buffer(u))
+ .toString('base64')
}
- return result;
-}
-function readReverseBase64(){
- if (!base64Str) return END_OF_INPUT;
- while (true){
- if (base64Count >= base64Str.length) return END_OF_INPUT;
- var nextCharacter = base64Str.charAt(base64Count);
- base64Count++;
- if (reverseBase64Chars[nextCharacter]){
- return reverseBase64Chars[nextCharacter];
+ : function (u) { return btoa(utob(u)) }
+ ;
+ var encode = function(u, urisafe) { // base64-encode String(u); truthy urisafe selects the URL-safe form
+ return !urisafe
+ ? _encode(String(u))
+ : _encode(String(u)).replace(/[+\/]/g, function(m0) { // URL-safe: '+' -> '-', '/' -> '_'
+ return m0 == '+' ? '-' : '_';
+ }).replace(/=/g, ''); // and strip the '=' padding
+ };
+ var encodeURI = function(u) { return encode(u, true) };
+ // decoder stuff
+ var re_btou = new RegExp([
+ '[\xC0-\xDF][\x80-\xBF]',
+ '[\xE0-\xEF][\x80-\xBF]{2}',
+ '[\xF0-\xF7][\x80-\xBF]{3}'
+ ].join('|'), 'g');
+ var cb_btou = function(cccc) {
+ switch(cccc.length) {
+ case 4:
+ var cp = ((0x07 & cccc.charCodeAt(0)) << 18)
+ | ((0x3f & cccc.charCodeAt(1)) << 12)
+ | ((0x3f & cccc.charCodeAt(2)) << 6)
+ | (0x3f & cccc.charCodeAt(3)),
+ offset = cp - 0x10000;
+ return (fromCharCode((offset >>> 10) + 0xD800)
+ + fromCharCode((offset & 0x3FF) + 0xDC00));
+ case 3:
+ return fromCharCode(
+ ((0x0f & cccc.charCodeAt(0)) << 12)
+ | ((0x3f & cccc.charCodeAt(1)) << 6)
+ | (0x3f & cccc.charCodeAt(2))
+ );
+ default:
+ return fromCharCode(
+ ((0x1f & cccc.charCodeAt(0)) << 6)
+ | (0x3f & cccc.charCodeAt(1))
+ );
}
- if (nextCharacter == 'A') return 0;
+ };
+ var btou = function(b) {
+ return b.replace(re_btou, cb_btou);
+ };
+ var cb_decode = function(cccc) { // decode one chunk of 1-4 base64 chars into up to 3 chars
+ var len = cccc.length,
+ padlen = len % 4, // non-zero only for a short (final) chunk
+ n = (len > 0 ? b64tab[cccc.charAt(0)] << 18 : 0) // 6 bits per input char, first char highest
+ | (len > 1 ? b64tab[cccc.charAt(1)] << 12 : 0)
+ | (len > 2 ? b64tab[cccc.charAt(2)] << 6 : 0)
+ | (len > 3 ? b64tab[cccc.charAt(3)] : 0),
+ chars = [
+ fromCharCode( n >>> 16),
+ fromCharCode((n >>> 8) & 0xff),
+ fromCharCode( n & 0xff)
+ ];
+ chars.length -= [0, 0, 2, 1][padlen]; // short chunk: 2 input chars keep 1 output char, 3 keep 2
+ return chars.join('');
+ };
+ var atob = global.atob ? function(a) { // use the host's atob when the global object provides one
+ return global.atob(a);
+ } : function(a){ // otherwise decode a in runs of up to 4 chars
+ return a.replace(/[\s\S]{1,4}/g, cb_decode);
+ };
+ var _decode = buffer ? function(a) {
+ return (a.constructor === buffer.constructor
+ ? a : new buffer(a, 'base64')).toString();
}
- return END_OF_INPUT;
-}
-function ntos(n){
- n=n.toString(16);
- if (n.length == 1) n="0"+n;
- n="%"+n;
- return unescape(n);
-}
-
-function decodeBase64(str){
- setBase64Str(str);
- var result = "";
- var inBuffer = new Array(4);
- var done = false;
- while (!done && (inBuffer[0] = readReverseBase64()) != END_OF_INPUT
- && (inBuffer[1] = readReverseBase64()) != END_OF_INPUT){
- inBuffer[2] = readReverseBase64();
- inBuffer[3] = readReverseBase64();
- result += ntos((((inBuffer[0] << 2) & 0xff)| inBuffer[1] >> 4));
- if (inBuffer[2] != END_OF_INPUT){
- result += ntos((((inBuffer[1] << 4) & 0xff)| inBuffer[2] >> 2));
- if (inBuffer[3] != END_OF_INPUT){
- result += ntos((((inBuffer[2] << 6) & 0xff) | inBuffer[3]));
- } else {
- done = true;
- }
- } else {
- done = true;
- }
+ : function(a) { return btou(atob(a)) };
+ var decode = function(a){ // decode base64 text; accepts the URL-safe alphabet as well
+ return _decode(
+ String(a).replace(/[-_]/g, function(m0) { return m0 == '-' ? '+' : '/' }) // map '-' -> '+', '_' -> '/'
+ .replace(/[^A-Za-z0-9\+\/]/g, '') // drop everything outside the alphabet, including '=' padding
+ );
+ };
+ var noConflict = function() { // put back the global.Base64 value saved at load time, return this library
+ var Base64 = global.Base64;
+ global.Base64 = _Base64; // _Base64 was captured before this library assigned global.Base64
+ return Base64;
+ };
+ // export Base64
+ global.Base64 = {
+ VERSION: version,
+ atob: atob,
+ btoa: btoa,
+ fromBase64: decode,
+ toBase64: encode,
+ utob: utob,
+ encode: encode,
+ encodeURI: encodeURI,
+ btou: btou,
+ decode: decode,
+ noConflict: noConflict
+ };
+ // if ES5 is available, make Base64.extendString() available
+ if (typeof Object.defineProperty === 'function') {
+ var noEnum = function(v){
+ return {value:v,enumerable:false,writable:true,configurable:true};
+ };
+ global.Base64.extendString = function () {
+ Object.defineProperty(
+ String.prototype, 'fromBase64', noEnum(function () {
+ return decode(this)
+ }));
+ Object.defineProperty(
+ String.prototype, 'toBase64', noEnum(function (urisafe) {
+ return encode(this, urisafe)
+ }));
+ Object.defineProperty(
+ String.prototype, 'toBase64URI', noEnum(function () {
+ return encode(this, true)
+ }));
+ };
}
- return result;
+ // that's it!
+})(this);
+
+if (this['Meteor']) {
+ Base64 = this.Base64; // for normal export in Meteor.js; `global` only exists inside the IIFE, which was called with this
}
diff --git a/scripts/ems/web/bilingual-concordance.css b/scripts/ems/web/bilingual-concordance.css
index e232337d2..4648a21dd 100644
--- a/scripts/ems/web/bilingual-concordance.css
+++ b/scripts/ems/web/bilingual-concordance.css
@@ -93,5 +93,6 @@ span.mismatch_aligned {
td.pp_more {
font-size: 70%;
+ color: navy;
text-align: center;
}
diff --git a/scripts/ems/web/index.php b/scripts/ems/web/index.php
index 6b785cf3f..d216b114a 100644
--- a/scripts/ems/web/index.php
+++ b/scripts/ems/web/index.php
@@ -8,7 +8,7 @@ require("diff.php");
require("sgviz.php");
function head($title) {
- print '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+ print '<!DOCTYPE html>
<html><head><title>'.$title.'</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<script language="javascript" src="javascripts/prototype.js"></script>
diff --git a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc
index 21f8c8cf6..57f78eb53 100644
--- a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc
+++ b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc
@@ -32,8 +32,8 @@ in a directory of your website, e.g. /javascripts.
Now, you can include the scripts by adding the following
tags to the HEAD section of your HTML pages:
- <script src="/javascripts/prototype.js" type="text/javascript"></script>
- <script src="/javascripts/scriptaculous.js" type="text/javascript"></script>
+ <script src="javascripts/prototype.js" type="text/javascript"></script>
+ <script src="javascripts/scriptaculous.js" type="text/javascript"></script>
scriptaculous.js will automatically load the other files of the
script.aculo.us distribution in, provided they are accessible
@@ -56,4 +56,4 @@ the sources of the examples provided.
== License
script.aculo.us is licensed under the terms of the MIT License,
-see the included MIT-LICENSE file. \ No newline at end of file
+see the included MIT-LICENSE file.
diff --git a/scripts/ems/web/overview.php b/scripts/ems/web/overview.php
index e56ed6f08..ce0434bb8 100644
--- a/scripts/ems/web/overview.php
+++ b/scripts/ems/web/overview.php
@@ -1,6 +1,5 @@
<?php
-date_default_timezone_set('Europe/London');
function setup() {
$setup = file("setup");
@@ -13,7 +12,7 @@ function setup() {
print "<TR><TD><A HREF=\"?setup=$dir[0]\">$dir[0]</A></TD><TD>$dir[1]</TD><TD>$dir[2]</TD><TD>$dir[3]</TD></TR>\n";
}
print "</TABLE>\n";
- print "<P>To add experiment, edit /fs/thor4/html/experiment/setup";
+ print "<p>To add experiment, edit the \"setup\" file.</p>";
}
function overview() {
@@ -26,7 +25,7 @@ function overview() {
head("Task: $task ($user)");
print "<a href=\"http://www.statmt.org/wiki/?n=Experiment.$setup\">Wiki Notes</a>";
- print " &nbsp; &nbsp; | &nbsp; &nbsp; <a href=\"/\">Overview of experiments</a> &nbsp; &nbsp; | &nbsp; &nbsp; <code>$dir</code><p>";
+ print " &nbsp; &nbsp; | &nbsp; &nbsp; <a href=\"?\">Overview of experiments</a> &nbsp; &nbsp; | &nbsp; &nbsp; <code>$dir</code><p>";
reset($experiment);
print "<form action=\"\" method=get>\n";
diff --git a/scripts/ems/web/progress.perl b/scripts/ems/web/progress.perl
index 6e26a7881..fd742e410 100755
--- a/scripts/ems/web/progress.perl
+++ b/scripts/ems/web/progress.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Date::Parse;
diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl
index 4adc97ca2..80a1b3120 100755
--- a/scripts/fuzzy-match/create_xml.perl
+++ b/scripts/fuzzy-match/create_xml.perl
@@ -1,8 +1,9 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
binmode( STDIN, ":utf8" );
binmode( STDOUT, ":utf8" );
+use warnings;
use strict;
use FindBin qw($RealBin);
use File::Basename;
diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl
index beca70eb0..c0b25f519 100755
--- a/scripts/generic/compound-splitter.perl
+++ b/scripts/generic/compound-splitter.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/generic/extract-factors.pl b/scripts/generic/extract-factors.pl
index fdd30082f..56c719051 100755
--- a/scripts/generic/extract-factors.pl
+++ b/scripts/generic/extract-factors.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
#extract-factors.pl: extract only the desired factors from a factored corpus
@@ -6,6 +6,7 @@
#factor indices start at 0
#factor indices too large ought to be ignored
+use warnings;
use strict;
my ($filename, @factors) = @ARGV;
diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl
index 71032ce1a..2b02fa869 100755
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@@ -1,8 +1,9 @@
-#! /usr/bin/perl -w
+#!/usr/bin/env perl
# example
# ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput
+use warnings;
use strict;
use File::Basename;
@@ -32,8 +33,8 @@ my $glueFile;
my $phraseOrientation = 0;
my $phraseOrientationPriorsFile;
-my $GZIP_EXEC; # = which("pigz");
-if(-f "/usr/bin/pigz") {
+my $GZIP_EXEC;
+if(`which pigz`) {
$GZIP_EXEC = 'pigz';
}
else {
diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl
index d13c87310..50bff1404 100755
--- a/scripts/generic/fsa2fsal.pl
+++ b/scripts/generic/fsa2fsal.pl
@@ -1,10 +1,11 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl
# A very simple script that converts fsa format (openfst lattices) to the same
# thing represented one sentence per line. It uses '|||' to delimit columns and
# ' ' to delimit nodes (i.e. original lines).
# Some rudimentary sanity checks are done on the fly.
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+use warnings;
use strict;
my $errs = 0;
diff --git a/scripts/generic/fsa2plf.pl b/scripts/generic/fsa2plf.pl
index debf8b60d..4e7454a9f 100755
--- a/scripts/generic/fsa2plf.pl
+++ b/scripts/generic/fsa2plf.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# Converts AT&T FSA format to 'python lattice format'.
# Note that the input FSA needs to be epsilon-free and topologically sorted.
# This script checks for topological sortedness.
@@ -8,6 +8,7 @@
# Note that the output format may not contain any spaces.
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+use warnings;
use strict;
use Getopt::Long;
diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl
index 36aed0ecd..d1aa461ac 100755
--- a/scripts/generic/fsal2fsa.pl
+++ b/scripts/generic/fsal2fsa.pl
@@ -1,7 +1,8 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl
# A very simple script that converts fsal back to fsa format (openfst lattices)
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+use warnings;
use strict;
while (<>) {
diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl
index 2becba31c..653912c5c 100755
--- a/scripts/generic/generic-parallel.perl
+++ b/scripts/generic/generic-parallel.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl
index 55192af74..8793d3d8e 100755
--- a/scripts/generic/giza-parallel.perl
+++ b/scripts/generic/giza-parallel.perl
@@ -1,8 +1,9 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl
# example
# ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align
+use warnings;
use strict;
use File::Basename;
diff --git a/scripts/generic/lopar2pos.pl b/scripts/generic/lopar2pos.pl
index a2b6e93b4..c75069135 100755
--- a/scripts/generic/lopar2pos.pl
+++ b/scripts/generic/lopar2pos.pl
@@ -1,9 +1,11 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
#lopar2pos: extract POSs from LOPAR output
#usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos
+use warnings;
+
my $infilename = shift @ARGV;
open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n";
while(my $line = <INFILE>)
diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl
index 47c7551b3..7c0f56c70 100755
--- a/scripts/generic/moses-parallel.pl
+++ b/scripts/generic/moses-parallel.pl
@@ -1,4 +1,4 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl
# $Id$
#######################
@@ -15,6 +15,7 @@
# added checks for existence of decoder and configuration file
# 26 Jul 2006 fix a bug related to the use of absolute path for srcfile and nbestfile
+use warnings;
use strict;
#######################
diff --git a/scripts/generic/mteval-v12.pl b/scripts/generic/mteval-v12.pl
index 1010eabfd..360376242 100755
--- a/scripts/generic/mteval-v12.pl
+++ b/scripts/generic/mteval-v12.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
use Encode;
diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl
index f1f8f9ef6..453c03e19 100755
--- a/scripts/generic/mteval-v13a.pl
+++ b/scripts/generic/mteval-v13a.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
use warnings;
use strict;
diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl
index 94da1504f..2f44d419f 100755
--- a/scripts/generic/multi-bleu.perl
+++ b/scripts/generic/multi-bleu.perl
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
my $lowercase = 0;
diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl
index a5e5f5a6b..ea56927ac 100755
--- a/scripts/generic/ph_numbers.perl
+++ b/scripts/generic/ph_numbers.perl
@@ -1,4 +1,5 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+
package ph_numbers;
# Script to recognize and replace numbers in Moses training corpora
@@ -6,6 +7,7 @@ package ph_numbers;
#
# (c) 2013 TAUS
+use warnings;
use strict;
run() unless caller();
diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl
index e34c84a74..622323bdb 100755
--- a/scripts/generic/qsub-wrapper.pl
+++ b/scripts/generic/qsub-wrapper.pl
@@ -1,6 +1,7 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
#######################
diff --git a/scripts/generic/reverse-alignment.perl b/scripts/generic/reverse-alignment.perl
index e19ddc9e5..d00140c74 100755
--- a/scripts/generic/reverse-alignment.perl
+++ b/scripts/generic/reverse-alignment.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my $line;
diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl
index a5aa4fb4d..9e5ee0025 100755
--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@@ -1,9 +1,10 @@
-#! /usr/bin/perl -w
+#!/usr/bin/env perl
# example
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1
+use warnings;
use strict;
use File::Basename;
@@ -13,8 +14,8 @@ sub GetSourcePhrase($);
sub NumStr($);
sub CutContextFile($$$);
-my $GZIP_EXEC; # = which("pigz");
-if(-f "/usr/bin/pigz") {
+my $GZIP_EXEC;
+if(`which pigz`) {
$GZIP_EXEC = 'pigz';
}
else {
diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl
index 40a61302a..95513b608 100755
--- a/scripts/generic/strip-xml.perl
+++ b/scripts/generic/strip-xml.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
while (my $line = <STDIN>) {
diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl
index 8ad53e880..596143386 100755
--- a/scripts/generic/trainlm-irst2.perl
+++ b/scripts/generic/trainlm-irst2.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
@@ -10,6 +10,7 @@
# irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
# Set smoothing method in settings, if different from modified Kneser-Ney
+use warnings;
use strict;
use FindBin qw($RealBin);
use Getopt::Long;
diff --git a/scripts/generic/trainlm-lmplz.perl b/scripts/generic/trainlm-lmplz.perl
deleted file mode 100755
index f9bc0d0da..000000000
--- a/scripts/generic/trainlm-lmplz.perl
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/perl -w
-
-# Compatible with sri LM-creating script, eg.
-# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
-# To use it in the EMS, add this to the [LM] section
-# lm-training = "$moses-script-dir/generic/trainlm-lmplz.perl -lmplz $lmplz"
-# settings = "-T $working-dir/tmp -S 10G"
-# Also, make sure that $lmplz is defined (in the [LM] or [GENERAL] section.
-# It should point to the binary file
-# lmplz = /home/waziz/workspace/github/moses/bin/lmplz
-
-use strict;
-use FindBin qw($RealBin);
-use Getopt::Long qw/GetOptionsFromArray/;
-#use Getopt::Long;
-Getopt::Long::Configure("pass_through", "no_ignore_case");
-
-my $order = 3; # order of language model (default trigram)
-my $corpus; # input text data
-my $lm; # generated language model
-my $lmplz; # bin directory of IRSTLM
-my $help = 0;
-
-my @optconfig = (
- "-order=s" => \$order,
- "-text=s" => \$corpus,
- "-lm=s" => \$lm,
- "-lmplz=s" => \$lmplz,
-);
-
-GetOptionsFromArray(\@ARGV, @optconfig);
-die("ERROR: please set text") unless defined($corpus);
-die("ERROR: please set lm") unless defined($lm);
-die("ERROR: please set lmplz") unless defined($lmplz);
-
-my $settings = join(' ', @ARGV);
-my $cmd = "$lmplz --order $order $settings < $corpus > $lm";
-
-print STDERR "EXECUTING $cmd\n";
-`$cmd`;
diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl
index 5aa7d4f85..130afd56b 100755
--- a/scripts/other/beautify.perl
+++ b/scripts/other/beautify.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use strict;
use File::Basename;
use FindBin qw($RealBin);
diff --git a/scripts/other/convert-pt.perl b/scripts/other/convert-pt.perl
index fa35b4490..f530a447a 100755
--- a/scripts/other/convert-pt.perl
+++ b/scripts/other/convert-pt.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# $Id$
# convert a phrase-table with alignment in Moses' dead-end format
diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl
index 2a4f51c89..08316c95b 100755
--- a/scripts/other/delete-scores.perl
+++ b/scripts/other/delete-scores.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/other/get_many_translations_from_google.perl b/scripts/other/get_many_translations_from_google.perl
index c9feae9ca..512b84e36 100755
--- a/scripts/other/get_many_translations_from_google.perl
+++ b/scripts/other/get_many_translations_from_google.perl
@@ -1,10 +1,12 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+
# Uses Google AJAX API to collect many translations, i.e. create a parallel
# corpus of Google translations.
# Expects one sentence per line, not tokenized!
#
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+use warnings;
use strict;
use Getopt::Long;
use CGI;
diff --git a/scripts/other/retain-lines.perl b/scripts/other/retain-lines.perl
index 6f7c517c2..b865e1af7 100755
--- a/scripts/other/retain-lines.perl
+++ b/scripts/other/retain-lines.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
#retain lines in clean.lines-retained.1
use strict;
diff --git a/scripts/other/translate_by_microsoft_bing.perl b/scripts/other/translate_by_microsoft_bing.perl
index 50e9a12d2..ad7a9c3b7 100755
--- a/scripts/other/translate_by_microsoft_bing.perl
+++ b/scripts/other/translate_by_microsoft_bing.perl
@@ -1,4 +1,5 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+
# Script implemented by Pranava Swaroop Madhyastha (a student at Charles
# University, UFAL)
diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl
index 012c143ac..549cd8abe 100755
--- a/scripts/recaser/detruecase.perl
+++ b/scripts/recaser/detruecase.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl
index 2858cda61..3ba83712a 100755
--- a/scripts/recaser/recase.perl
+++ b/scripts/recaser/recase.perl
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl
index ad75af068..87a720f6e 100755
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
use FindBin qw($Bin);
use Getopt::Long "GetOptions";
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 59a83ec91..b653a8ca5 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
@@ -8,6 +8,7 @@
# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token.
#
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl
index ca0cf44ee..373aa509f 100755
--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@@ -1,6 +1,8 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/regression-testing/compare-results.pl b/scripts/regression-testing/compare-results.pl
index 744334d29..df14d444f 100755
--- a/scripts/regression-testing/compare-results.pl
+++ b/scripts/regression-testing/compare-results.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($results, $truth) = @ARGV;
diff --git a/scripts/regression-testing/create_localized_moses_ini.pl b/scripts/regression-testing/create_localized_moses_ini.pl
index b102a4ed4..612a39e82 100755
--- a/scripts/regression-testing/create_localized_moses_ini.pl
+++ b/scripts/regression-testing/create_localized_moses_ini.pl
@@ -1,5 +1,6 @@
-#! /usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use MosesScriptsRegressionTesting;
diff --git a/scripts/regression-testing/modify-pars.pl b/scripts/regression-testing/modify-pars.pl
index 23576873b..5ad2514a4 100755
--- a/scripts/regression-testing/modify-pars.pl
+++ b/scripts/regression-testing/modify-pars.pl
@@ -1,5 +1,6 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use strict;
my $argv=join(" ",@ARGV);
diff --git a/scripts/regression-testing/moses-virtual.pl b/scripts/regression-testing/moses-virtual.pl
index 4bb852242..41ddd6b13 100755
--- a/scripts/regression-testing/moses-virtual.pl
+++ b/scripts/regression-testing/moses-virtual.pl
@@ -1,5 +1,6 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use strict;
my %opt = ();
diff --git a/scripts/regression-testing/run-single-test.pl b/scripts/regression-testing/run-single-test.pl
index 0c5efa3a7..bb66e96f6 100755
--- a/scripts/regression-testing/run-single-test.pl
+++ b/scripts/regression-testing/run-single-test.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use MosesScriptsRegressionTesting;
diff --git a/scripts/regression-testing/run-test-suite.pl b/scripts/regression-testing/run-test-suite.pl
index 7cdfc21d9..8ae9ec60f 100755
--- a/scripts/regression-testing/run-test-suite.pl
+++ b/scripts/regression-testing/run-test-suite.pl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use Getopt::Long;
diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl
index ca4e8a1b3..0e73a7718 100755
--- a/scripts/tokenizer/deescape-special-chars-PTB.perl
+++ b/scripts/tokenizer/deescape-special-chars-PTB.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl
index 7dc6bc539..076d1e62f 100755
--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index a8de7e86e..7874d5d04 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
# Sample De-Tokenizer
@@ -7,6 +7,8 @@
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
+
+use warnings;
use strict;
use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
@@ -36,7 +38,7 @@ if ($HELP) {
exit;
}
-if ($language !~ /^(cs|en|fr|it)$/) {
+if ($language !~ /^(cs|en|fr|it|fi)$/) {
print STDERR "Warning: No built-in rules for language $language.\n"
}
@@ -176,6 +178,11 @@ sub detokenize {
}
+ } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
+ # Finnish : without intervening space if followed by case suffix
+ # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
+ $text=$text. lc $words[$i];
+ $prependSpace = " ";
} else {
$text=$text.$prependSpace.$words[$i];
$prependSpace = " ";
diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl
index 89afdb0e3..e94b91744 100755
--- a/scripts/tokenizer/escape-special-chars.perl
+++ b/scripts/tokenizer/escape-special-chars.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl
index c30e029b9..9ee307bc2 100755
--- a/scripts/tokenizer/lowercase.perl
+++ b/scripts/tokenizer/lowercase.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
binmode(STDIN, ":utf8");
diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl
index c679ab2a7..db8f9c60e 100755
--- a/scripts/tokenizer/normalize-punctuation.perl
+++ b/scripts/tokenizer/normalize-punctuation.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my $language = "en";
diff --git a/scripts/tokenizer/pre-tok-clean.perl b/scripts/tokenizer/pre-tok-clean.perl
new file mode 100755
index 000000000..900e992ee
--- /dev/null
+++ b/scripts/tokenizer/pre-tok-clean.perl
@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+
+# Filter a sentence-aligned parallel corpus by line length in characters.
+#
+# Arguments (positional):
+#   minChars maxChars inputStem source target outputStem linesRetained
+#
+# Reads inputStem.source and inputStem.target in lockstep. A sentence pair is
+# kept only when both sides are between minChars and maxChars characters long
+# (inclusive). Kept pairs go to outputStem.source / outputStem.target, and the
+# 1-based input line number of each kept pair goes to the linesRetained file.
+
+use warnings;
+use strict;
+
+my ($minChars, $maxChars, $inputStem, $source, $target, $outputStem, $linesRetained) = @ARGV;
+
+open(my $inSource, "<:encoding(UTF-8)", "$inputStem.$source") or die "cannot open $inputStem.$source";
+open(my $inTarget, "<:encoding(UTF-8)", "$inputStem.$target") or die "cannot open $inputStem.$target";
+
+open(my $outSource, ">:encoding(UTF-8)", "$outputStem.$source") or die "cannot open $outputStem.$source";
+open(my $outTarget, ">:encoding(UTF-8)", "$outputStem.$target") or die "cannot open $outputStem.$target";
+
+open(my $lineRetained, ">:encoding(UTF-8)", $linesRetained) or die "cannot open $linesRetained";
+
+my $lineNum = 0;
+while (my $lineSource = <$inSource>) {
+  ++$lineNum;
+
+  chomp($lineSource);
+  my $lineTarget = <$inTarget>;
+  # A missing target line means the two sides are not sentence-aligned.
+  die "$inputStem.$target ends before line $lineNum of $inputStem.$source"
+    unless defined $lineTarget;
+  chomp($lineTarget);
+
+  my $lenSource = length($lineSource);
+  my $lenTarget = length($lineTarget);
+
+  # skip the pair if either side is outside [minChars, maxChars]
+  next if $lenSource < $minChars || $lenSource > $maxChars
+       || $lenTarget < $minChars || $lenTarget > $maxChars;
+
+  print $outSource "$lineSource\n";
+  print $outTarget "$lineTarget\n";
+  print $lineRetained "$lineNum\n";
+}
+
+close($inSource);
+close($inTarget);
+
+# Buffered write errors only surface at close, so check every output handle.
+close($outSource) or die "cannot close $outputStem.$source";
+close($outTarget) or die "cannot close $outputStem.$target";
+close($lineRetained) or die "cannot close $linesRetained";
diff --git a/scripts/tokenizer/pre-tokenizer.perl b/scripts/tokenizer/pre-tokenizer.perl
index cb6218716..499671b44 100755
--- a/scripts/tokenizer/pre-tokenizer.perl
+++ b/scripts/tokenizer/pre-tokenizer.perl
@@ -1,8 +1,10 @@
-#!/usr/bin/perl -W
+#!/usr/bin/env perl
+
# script for preprocessing language data prior to tokenization
# Start by Ulrich Germann, after noticing systematic preprocessing errors
# in some of the English Europarl data.
+use warnings;
use strict;
use Getopt::Std;
diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py
new file mode 100644
index 000000000..76736da5c
--- /dev/null
+++ b/scripts/tokenizer/pre_tokenize_cleaning.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+The Gacha filter cleans out sentence pairs that have global character mean
+lower than a certain threshold.
+
+Use this cleaner to produce low quantity of high quality sentence pairs.
+
+It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
+WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
+(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
+
+This is inspired by the global character mean that is used in the Gale-Church
+algorithm (Gale and Church, 1993), the c variable in:
+
+    delta = (l2-l1*c)/math.sqrt(l1*s2)
+
+where:
+ - l1 = len(source_sentence)
+ - l2 = len(target_sentence)
+ - c = global mean, i.e. #char in source corpus / #char in target corpus
+ - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1)
+
+(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
+"""
+
+import io
+import subprocess
+import sys
+
+red = '\033[01;31m'
+native = '\033[m'
+
+
+def err_msg(txt):
+    """Return txt wrapped in the `red` / `native` terminal escape codes."""
+    return red + txt + native
+
+
+def num_char(filename):
+    """Return the character count reported by `wc -m filename`, as a float."""
+    wc_out, _ = subprocess.Popen(["wc", "-m", filename],
+                                 stdout=subprocess.PIPE).communicate()
+    return float(wc_out.split()[0])
+
+
+def gacha_mean(sourcefile, targetfile):
+    """
+    Counts the global character mean between source and target language as
+    in Gale-Church (1993)
+    """
+    sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
+    c = num_char(sourcefile) / num_char(targetfile)
+    sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
+    sys.stderr.write(err_msg('Filtering starts ...\n'))
+    return c
+
+
+def main(sourcefile, targetfile, threshold=0.2):
+    """
+    Print to stdout, as "source<TAB>target", every sentence pair whose
+    source/target length ratio lies strictly between (1-threshold)*c and
+    (1+threshold)*c, where c is the Gacha mean of the two files.
+    """
+    # Calculates Gacha mean.
+    c = gacha_mean(sourcefile, targetfile)
+    # Calculates lower and upperbound for filtering
+    # (threshold arrives as a string when taken from the command line)
+    threshold = float(threshold)
+    lowerbound = (1-threshold) * c
+    upperbound = (1+threshold) * c
+
+    # Start filtering sentences.
+    with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
+            io.open(targetfile, 'r', encoding='utf8') as trgfin:
+        for s, t in zip(srcfin, trgfin):
+            # lengths are taken before strip(), i.e. including the newline
+            if lowerbound < len(s) / float(len(t)) < upperbound:
+                print(u"{}\t{}\n".format(s.strip(),t.strip()))
+
+
+if __name__ == '__main__':
+    if len(sys.argv) not in range(3,5):
+        usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
+                            % sys.argv[0])
+
+        example_msg = err_msg('Example: python %s ~/Europarl.de-en.de '
+                              '~/Europarl.de-en.en 0.4\n'
+                              % sys.argv[0])
+        sys.stderr.write(usage_msg)
+        sys.stderr.write(example_msg)
+        sys.exit(1)
+
+    main(*sys.argv[1:])
diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl
index 2a7bec07b..2b90dfd3b 100755
--- a/scripts/tokenizer/remove-non-printing-char.perl
+++ b/scripts/tokenizer/remove-non-printing-char.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use utf8;
binmode(STDIN, ":utf8");
diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl
index ab1d5808d..08eb766bf 100755
--- a/scripts/tokenizer/replace-unicode-punctuation.perl
+++ b/scripts/tokenizer/replace-unicode-punctuation.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
#binmode(STDIN, ":utf8");
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 03143e467..8abffbea4 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -1,4 +1,5 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl
+
use warnings;
# Sample Tokenizer
@@ -15,10 +16,15 @@ use warnings;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
+use warnings;
use FindBin qw($RealBin);
use strict;
use Time::HiRes;
-use Thread;
+
+if (eval {require Thread;1;}) {
+ #module loaded
+ Thread->import();
+}
my $mydir = "$RealBin/../share/nonbreaking_prefixes";
diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl
index e2cce2e4b..bce7a38a0 100755
--- a/scripts/tokenizer/tokenizer_PTB.perl
+++ b/scripts/tokenizer/tokenizer_PTB.perl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# Sample Tokenizer
### Version 1.1
@@ -14,6 +14,7 @@
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
+use warnings;
use FindBin qw($RealBin);
use strict;
use Time::HiRes;
diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl
index 99efafe8e..5c9c0970a 100755
--- a/scripts/training/absolutize_moses_model.pl
+++ b/scripts/training/absolutize_moses_model.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# $Id$
# given a moses.ini file, prints a copy to stdout but replaces all relative
@@ -6,6 +6,8 @@
#
# Ondrej Bojar.
+use warnings;
+
my $ini = shift;
die "usage: absolutize_moses_model.pl path-to-moses.ini > moses.abs.ini"
if !defined $ini;
diff --git a/scripts/training/analyse_moses_model.pl b/scripts/training/analyse_moses_model.pl
index 62dab218f..7a3b27e65 100755
--- a/scripts/training/analyse_moses_model.pl
+++ b/scripts/training/analyse_moses_model.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# $Id$
# given a moses.ini file, checks the translation and generation tables and reports
diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py
index 66f8f0413..cd8755580 100755
--- a/scripts/training/bilingual-lm/extract_training.py
+++ b/scripts/training/bilingual-lm/extract_training.py
@@ -147,7 +147,7 @@ def main():
#Numberize the file
for line in ngrams_file_handle:
- numberized_file_handle.write(extract.numberize(line, m, n, tvocab_idmap, tvocab_idmap))
+ numberized_file_handle.write(extract.numberize(line, options.m, options.n, svocab_idmap, tvocab_idmap))
numberized_file_handle.close()
ngrams_file_handle.close()
diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl
index 15ad23ac4..3d4798ffd 100755
--- a/scripts/training/binarize-model.perl
+++ b/scripts/training/binarize-model.perl
@@ -1,9 +1,10 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
#
# Binarize a Moses model
#
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/training/build-generation-table.perl b/scripts/training/build-generation-table.perl
index cf707811e..fb59f4acc 100755
--- a/scripts/training/build-generation-table.perl
+++ b/scripts/training/build-generation-table.perl
@@ -1,6 +1,7 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl
index 00a56977e..a7ddaff70 100755
--- a/scripts/training/build-mmsapt.perl
+++ b/scripts/training/build-mmsapt.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index 18282858d..e1e96528c 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
+use warnings;
use strict;
use Getopt::Long;
my $help;
@@ -49,7 +50,7 @@ my $l1input = "$corpus.$l1";
if (-e $l1input) {
$opn = $l1input;
} elsif (-e $l1input.".gz") {
- $opn = "zcat $l1input.gz |";
+ $opn = "gunzip -c $l1input.gz |";
} else {
die "Error: $l1input does not exist";
}
@@ -59,7 +60,7 @@ my $l2input = "$corpus.$l2";
if (-e $l2input) {
$opn = $l2input;
} elsif (-e $l2input.".gz") {
- $opn = "zcat $l2input.gz |";
+ $opn = "gunzip -c $l2input.gz |";
} else {
die "Error: $l2input does not exist";
}
@@ -154,7 +155,7 @@ print STDERR "Input sentences: $innr Output sentences: $outnr\n";
sub word_count {
my ($line) = @_;
if ($ignore_xml) {
- $line =~ s/<\S[^>]*\S>//g;
+ $line =~ s/<\S[^>]*\S>/ /g;
$line =~ s/\s+/ /g;
$line =~ s/^ //g;
$line =~ s/ $//g;
diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl
index 29aed3f2a..5e9dff72a 100755
--- a/scripts/training/clone_moses_model.pl
+++ b/scripts/training/clone_moses_model.pl
@@ -1,10 +1,11 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# $Id$
# given a moses.ini file, creates a fresh version of it
# in the current directory
# All relevant files are hardlinked or copied to the directory, too.
+use warnings;
use strict;
use Getopt::Long;
diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl
index 8a57a6b57..dfdf020a0 100755
--- a/scripts/training/combine_factors.pl
+++ b/scripts/training/combine_factors.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# $Id$
# given a list of files, combines them to a single corpus (sent to stdout)
diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl
index 867c7eca7..25c562ef4 100755
--- a/scripts/training/convert-moses-ini-to-v2.perl
+++ b/scripts/training/convert-moses-ini-to-v2.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my $header = "";
diff --git a/scripts/training/corpus-sizes.perl b/scripts/training/corpus-sizes.perl
index f317c5665..02dd4ae9b 100755
--- a/scripts/training/corpus-sizes.perl
+++ b/scripts/training/corpus-sizes.perl
@@ -1,7 +1,8 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $
+use warnings;
use strict;
my ($in,$out,@PART) = @ARGV;
diff --git a/scripts/training/exodus.perl b/scripts/training/exodus.perl
index f5a5cbdea..d3466f5dd 100755
--- a/scripts/training/exodus.perl
+++ b/scripts/training/exodus.perl
@@ -1,7 +1,8 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
my @LINE = <STDIN>;
diff --git a/scripts/training/get-lexical.perl b/scripts/training/get-lexical.perl
index e23c15665..45fe6d54c 100755
--- a/scripts/training/get-lexical.perl
+++ b/scripts/training/get-lexical.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use strict;
use FindBin qw($RealBin);
BEGIN { require "$RealBin/LexicalTranslationModel.pm"; "LexicalTranslationModel"->import; }
diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl
index 553ff2b3e..56fc9a466 100755
--- a/scripts/training/giza2bal.pl
+++ b/scripts/training/giza2bal.pl
@@ -1,4 +1,4 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl
# $Id$
#Converts direct and inverted alignments into a more compact
@@ -7,6 +7,8 @@
#Copyright Marcello Federico, November 2004
+#use warnings;
+
($cnt,$dir,$inv)=();
while ($w=shift @ARGV){
@@ -17,7 +19,7 @@ while ($w=shift @ARGV){
my $lc = 0;
-if (!$dir || !inv){
+if (!$dir || !$inv){
print "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n";
print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
exit(0);
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 04e174c1b..86084abbf 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
# Usage:
# mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
@@ -47,6 +47,7 @@
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn
+use warnings;
use strict;
use FindBin qw($RealBin);
use File::Basename;
diff --git a/scripts/training/postprocess-lopar.perl b/scripts/training/postprocess-lopar.perl
index b5ae79b2a..5171e02fb 100755
--- a/scripts/training/postprocess-lopar.perl
+++ b/scripts/training/postprocess-lopar.perl
@@ -1,7 +1,8 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
use utf8;
diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl
index c7269abf9..c265652f6 100755
--- a/scripts/training/reduce-factors.perl
+++ b/scripts/training/reduce-factors.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
@@ -10,11 +11,12 @@ my $___FACTOR_DELIMITER = "|";
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
-my ($CORPUS,$REDUCED,$FACTOR);
+my ($CORPUS,$REDUCED,$FACTOR,$_XML);
die("ERROR: wrong syntax when invoking reduce-factors")
unless &GetOptions('corpus=s' => \$CORPUS,
'reduced-corpus=s' => \$REDUCED,
- 'factor=s' => \$FACTOR);
+ 'factor=s' => \$FACTOR,
+ 'xml' => \$_XML);
&reduce_factors($CORPUS,$REDUCED,$FACTOR);
@@ -24,9 +26,9 @@ sub reduce_factors {
my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);
- print "Reducing factors to produce $reduced @ ".`date`;
+ print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`;
while(-e $reduced.".lock") {
- sleep(10);
+ sleep(10);
}
if (-e $reduced) {
print STDERR " $reduced in place, reusing\n";
@@ -37,29 +39,31 @@ sub reduce_factors {
return;
}
- # peek at input, to check if we are asked to produce exactly the
- # available factors
- my $inh = open_or_zcat($full);
- my $firstline = <$inh>;
- die "Corpus file $full is empty" unless $firstline;
- close $inh;
- # pick first word
- $firstline =~ s/^\s*//;
- $firstline =~ s/\s.*//;
- # count factors
- my @WORD = split(/ /,$firstline);
- my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
- my $maxfactorindex = scalar(@FACTOR)-1;
- if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
- # create just symlink; preserving compression
- my $realfull = $full;
- if (!-e $realfull && -e $realfull.".gz") {
+ unless ($_XML) {
+ # peek at input, to check if we are asked to produce exactly the
+ # available factors
+ my $inh = open_or_zcat($full);
+ my $firstline = <$inh>;
+ die "Corpus file $full is empty" unless $firstline;
+ close $inh;
+ # pick first word
+ $firstline =~ s/^\s*//;
+ $firstline =~ s/\s.*//;
+ # count factors
+ my @WORD = split(/ /,$firstline);
+ my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
+ my $maxfactorindex = scalar(@FACTOR)-1;
+ if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
+ # create just symlink; preserving compression
+ my $realfull = $full;
+ if (!-e $realfull && -e $realfull.".gz") {
$realfull .= ".gz";
$reduced =~ s/(\.gz)?$/.gz/;
- }
- safesystem("ln -s '$realfull' '$reduced'")
+ }
+ safesystem("ln -s '$realfull' '$reduced'")
or die "Failed to create symlink $realfull -> $reduced";
- return;
+ return;
+ }
}
# The default is to select the needed factors
@@ -71,23 +75,30 @@ sub reduce_factors {
$nr++;
print STDERR "." if $nr % 10000 == 0;
print STDERR "($nr)" if $nr % 100000 == 0;
- chomp; s/ +/ /g; s/^ //; s/ $//;
- my $first = 1;
- foreach (split) {
- my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
+ s/<\S[^>]*>/ /g if $_XML; # remove xml
+ chomp; s/ +/ /g; s/^ //; s/ $//;
+ my $first = 1;
+ foreach (split) {
+ my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
# \Q causes to disable metacharacters in regex
- print OUT " " unless $first;
- $first = 0;
- my $first_factor = 1;
+ print OUT " " unless $first;
+ $first = 0;
+ my $first_factor = 1;
foreach my $outfactor (@INCLUDE) {
- print OUT "|" unless $first_factor;
+ print OUT $___FACTOR_DELIMITER unless $first_factor;
$first_factor = 0;
my $out = $FACTOR[$outfactor];
die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
print OUT $out;
}
- }
- print OUT "\n";
+ # for(my $factor=0;$factor<=$#FACTOR;$factor++) {
+ # next unless defined($INCLUDE{$factor});
+ # print OUT "|" unless $first_factor;
+ # $first_factor = 0;
+ # print OUT $FACTOR[$factor];
+ # }
+ }
+ print OUT "\n";
}
print STDERR "\n";
close(OUT);
diff --git a/scripts/training/reduce-topt-count.pl b/scripts/training/reduce-topt-count.pl
index 15458b0b5..769f44a7e 100755
--- a/scripts/training/reduce-topt-count.pl
+++ b/scripts/training/reduce-topt-count.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# given a moses.ini, filter the phrase tables to contain
# only ttable-limit options per source phrase
diff --git a/scripts/training/reduce_combine.pl b/scripts/training/reduce_combine.pl
index 1c7908454..3d0abf29a 100755
--- a/scripts/training/reduce_combine.pl
+++ b/scripts/training/reduce_combine.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# $Id$
# given a pathname to a factored corpus, a list of (numeric) factors to keep
diff --git a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl
index b12281cf8..bd5d7f1d2 100755
--- a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl
+++ b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($ttable_file) = @ARGV;
diff --git a/scripts/training/strip-xml.perl b/scripts/training/strip-xml.perl
new file mode 100755
index 000000000..0f403d15d
--- /dev/null
+++ b/scripts/training/strip-xml.perl
@@ -0,0 +1,23 @@
+#!/usr/bin/env perl
+
+# strip text file of any XML markup
+#
+# Reads text from STDIN and writes it to STDOUT with every XML-like tag
+# (a '<' immediately followed by a non-space character, up to the next '>')
+# replaced by a space; runs of spaces are then collapsed and a space at the
+# start or end of the line is removed.
+
+use warnings;
+use strict;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+while (my $line = <STDIN>) {
+  chomp($line);
+  $line =~ s/<\S[^>]*>/ /g;  # tag -> space, so neighbouring words stay apart
+  $line =~ s/ +/ /g;
+  $line =~ s/^ //;
+  $line =~ s/ $//;
+  print "$line\n";
+}
diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl
index 1d5cfbbb4..a23fb8b5c 100755
--- a/scripts/training/threshold-filter.perl
+++ b/scripts/training/threshold-filter.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my %MIN_SCORE;
diff --git a/scripts/training/train-global-lexicon-model.perl b/scripts/training/train-global-lexicon-model.perl
index f18fb6f2e..0e7d3077d 100755
--- a/scripts/training/train-global-lexicon-model.perl
+++ b/scripts/training/train-global-lexicon-model.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use Switch;
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index f92e545be..4c355479c 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
@@ -404,8 +405,8 @@ else {
$SORT_EXEC = 'sort';
}
-my $GZIP_EXEC; # = which("pigz");
-if(-f "/usr/bin/pigz") {
+my $GZIP_EXEC;
+if(`which pigz`) {
$GZIP_EXEC = 'pigz';
}
else {
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
index adb34df2f..3dd8fc4ac 100755
--- a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
index e447ee146..e61a53652 100755
--- a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py
index 69ee4f737..0e361df0b 100755
--- a/scripts/training/wrappers/conll2mosesxml.py
+++ b/scripts/training/wrappers/conll2mosesxml.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
diff --git a/scripts/training/wrappers/filter-excluded-lines.perl b/scripts/training/wrappers/filter-excluded-lines.perl
index 16584ca8f..7f9da3efa 100755
--- a/scripts/training/wrappers/filter-excluded-lines.perl
+++ b/scripts/training/wrappers/filter-excluded-lines.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long;
diff --git a/scripts/training/wrappers/find-unparseable.perl b/scripts/training/wrappers/find-unparseable.perl
index 0aa560815..b0d38027b 100755
--- a/scripts/training/wrappers/find-unparseable.perl
+++ b/scripts/training/wrappers/find-unparseable.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my $lineNum = 1;
diff --git a/scripts/training/wrappers/mada-wrapper.perl b/scripts/training/wrappers/mada-wrapper.perl
index dd83346ca..20f76f821 100755
--- a/scripts/training/wrappers/mada-wrapper.perl
+++ b/scripts/training/wrappers/mada-wrapper.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use File::Temp qw/tempfile/;
use Getopt::Long "GetOptions";
diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl
new file mode 100755
index 000000000..6e7efe245
--- /dev/null
+++ b/scripts/training/wrappers/madamira-wrapper.perl
@@ -0,0 +1,113 @@
+#!/usr/bin/env perl
+
+# Wrapper around MADAMIRA.
+#
+# Copies STDIN to a temporary file, splits it into chunks of 10000 lines,
+# runs MADAMIRA.jar on the chunks through "parallel", concatenates the
+# resulting .mada files and prints the text of every ";;WORD" line to STDOUT,
+# starting a new output line at every "SENTENCE BREAK" line.
+#
+# Options:
+#   --mada-dir DIR  MADAMIRA installation directory (required)
+#   --tmpdir DIR    parent of the working directory (default: tmp)
+#   --scheme NAME   accepted (default: D2) but not used by this script
+#   --keep-tmp      accepted, but has no effect while the cleanup is disabled
+
+use warnings;
+use strict;
+use File::Temp qw/tempfile/;
+use Getopt::Long "GetOptions";
+use File::Basename;
+use FindBin qw($RealBin);
+use Cwd 'abs_path';
+
+my $TMPDIR = "tmp";
+my $SCHEME = "D2";
+my $KEEP_TMP = 0;
+my $MADA_DIR;
+
+GetOptions(
+  "scheme=s" => \$SCHEME,
+  "tmpdir=s" => \$TMPDIR,
+  "keep-tmp" => \$KEEP_TMP,
+  "mada-dir=s" => \$MADA_DIR
+    ) or die("ERROR: unknown options");
+
+# the MADAMIRA command line below cannot be built without it
+die("ERROR: --mada-dir is required") unless defined($MADA_DIR);
+
+$TMPDIR = abs_path($TMPDIR);
+print STDERR "TMPDIR=$TMPDIR \n";
+
+#binmode(STDIN, ":utf8");
+#binmode(STDOUT, ":utf8");
+
+$TMPDIR = "$TMPDIR/madamira.$$";
+`mkdir -p $TMPDIR`;
+`mkdir -p $TMPDIR/split`;
+`mkdir -p $TMPDIR/out`;
+
+my $infile = "$TMPDIR/input";
+print STDERR $infile."\n";
+
+open(my $tmpFh, ">", $infile) or die("ERROR: cannot write $infile: $!");
+while(<STDIN>) {
+  print $tmpFh $_;
+}
+close($tmpFh) or die("ERROR: cannot close $infile: $!");
+
+my $cmd;
+
+# split input file
+my $SPLIT_EXEC = `gsplit --help 2>/dev/null`;
+if($SPLIT_EXEC) {
+  $SPLIT_EXEC = 'gsplit';
+}
+else {
+  $SPLIT_EXEC = 'split';
+}
+
+$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x";
+`$cmd`;
+
+$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*";
+print STDERR "Executing: $cmd\n";
+`$cmd`;
+
+$cmd = "cat $TMPDIR/out/x*.mada > $infile.mada";
+print STDERR "Executing: $cmd\n";
+`$cmd`;
+
+# get stuff out of mada output
+open(my $madaOut, "<", "$infile.mada") or die("ERROR: cannot read $infile.mada: $!");
+#binmode($madaOut, ":utf8");
+while(my $line = <$madaOut>) {
+  # NOTE(review): chop (not chomp) together with the "length - 8" below drops
+  # two trailing characters in total; presumably that matches the .mada line
+  # format -- confirm before changing either.
+  chop($line);
+  #print STDERR "line=$line \n";
+
+  if (index($line, "SENTENCE BREAK") == 0) {
+    # new sentence
+    #print STDERR "BREAK\n";
+    print "\n";
+  }
+  elsif (index($line, ";;WORD") == 0) {
+    # word
+    my $word = substr($line, 7, length($line) - 8);
+    #print STDERR "FOund $word\n";
+    print "$word ";
+  }
+  else {
+    #print STDERR "NADA\n";
+  }
+}
+close($madaOut);
+
+
+# NOTE: cleanup is currently disabled, so the working directory is always kept
+if ($KEEP_TMP == 0) {
+#  `rm -rf $TMPDIR`;
+}
+
diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
index 38e331737..88d16b3f6 100755
--- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
+++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV;
diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl
index d96a1ce00..1cc917bce 100755
--- a/scripts/training/wrappers/make-factor-de-morph.perl
+++ b/scripts/training/wrappers/make-factor-de-morph.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Encode;
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl
index 459961c77..2eadd4123 100755
--- a/scripts/training/wrappers/make-factor-de-pos.perl
+++ b/scripts/training/wrappers/make-factor-de-pos.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($in,$out,$tmpdir) = @ARGV;
diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
index c3c309bad..0d27aa12f 100755
--- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
+++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl
index d2b5a755c..2af6eb75c 100755
--- a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl
+++ b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
# handle switches
diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl
index 892c1636c..60aca0b34 100755
--- a/scripts/training/wrappers/make-factor-stem.perl
+++ b/scripts/training/wrappers/make-factor-stem.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($size,$in,$out) = @ARGV;
diff --git a/scripts/training/wrappers/make-factor-suffix.perl b/scripts/training/wrappers/make-factor-suffix.perl
index 20247a013..7e864ea0c 100755
--- a/scripts/training/wrappers/make-factor-suffix.perl
+++ b/scripts/training/wrappers/make-factor-suffix.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($size,$in,$out) = @ARGV;
diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
index ef6e66024..fc1f0c532 100755
--- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
+++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py
new file mode 100755
index 000000000..bd876f087
--- /dev/null
+++ b/scripts/training/wrappers/mosesxml2brackets.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# convert trees in moses XML format to PTB-style bracketed format
+
+from __future__ import print_function, unicode_literals
+import sys
+import codecs
+
+from lxml import etree as ET
+
+def escape(word):
+    """Replace the characters | [ ] ' and double quote with XML entities."""
+    for char, entity in (('|', '&#124;'),   # factor separator
+                         ('[', '&#91;'),    # syntax non-terminal
+                         (']', '&#93;'),    # syntax non-terminal
+                         ('\'', '&apos;'), ('\"', '&quot;')):
+        word = word.replace(char, entity)
+    return word
+
+def make_brackets(xml):
+    """Render element xml and its subtree as the string ' [label ...]'.
+
+    An element with non-blank text becomes ' [label word]' (the text is
+    stripped and passed through escape(); its children are not visited).
+    Otherwise the renderings of its children are concatenated in order.
+    """
+
+    label = xml.get('label')
+    text = xml.text.strip() if xml.text else ''
+    if text:
+        return ' [' + label + ' ' + escape(text) + ']'
+
+    # No text of its own: recurse into the child elements.
+    return ' [' + label + ''.join(make_brackets(kid) for kid in xml) + ']'
+
+
+if __name__ == '__main__':
+    # Python 2 only: wrap the standard streams in UTF-8 reader/writers.
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    for line in sys.stdin:
+        # A whitespace-only line holds no tree: emit an empty line for it
+        # rather than letting the XML parser fail on it.
+        if not line.strip():
+            sys.stdout.write('\n')
+        else:
+            sys.stdout.write(make_brackets(ET.fromstring(line)).strip() + '\n')
diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl
index 03d90eaca..68df07c49 100755
--- a/scripts/training/wrappers/parse-de-berkeley.perl
+++ b/scripts/training/wrappers/parse-de-berkeley.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl
index f884b5c01..4723d6aa0 100755
--- a/scripts/training/wrappers/parse-de-bitpar.perl
+++ b/scripts/training/wrappers/parse-de-bitpar.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/parse-en-collins.perl b/scripts/training/wrappers/parse-en-collins.perl
index d71f1a293..27b33a2dd 100755
--- a/scripts/training/wrappers/parse-en-collins.perl
+++ b/scripts/training/wrappers/parse-en-collins.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use File::Basename;
use File::Temp qw/tempfile/;
diff --git a/scripts/training/wrappers/parse-en-egret.perl b/scripts/training/wrappers/parse-en-egret.perl
index 70403c970..c3d23a4ee 100755
--- a/scripts/training/wrappers/parse-en-egret.perl
+++ b/scripts/training/wrappers/parse-en-egret.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl
index d78106fe2..1bb616939 100755
--- a/scripts/training/wrappers/syntax-hyphen-splitting.perl
+++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl
index 1e4b5495d..4f26efabe 100755
--- a/scripts/training/wrappers/tagger-german-chunk.perl
+++ b/scripts/training/wrappers/tagger-german-chunk.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";