diff options
author | alvations <alvations@gmail.com> | 2015-04-26 21:38:27 +0300 |
---|---|---|
committer | alvations <alvations@gmail.com> | 2015-04-26 21:38:27 +0300 |
commit | fa30ea671242fedecc65675bd4f5edbca59d5053 (patch) | |
tree | bb885fafa74390cb93ac2e414870845f012df8f9 /scripts | |
parent | 4a68c42b16626e2ee707e93a6453eda51dc807a1 (diff) | |
parent | ec54ea3c4fcdb055661dba1fe3003d6bb1a0bed8 (diff) |
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder into moses-smt-master
Diffstat (limited to 'scripts')
153 files changed, 961 insertions, 361 deletions
diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl index ae5a386fa..e2b604f0b 100755 --- a/scripts/OSM/OSM-Train.perl +++ b/scripts/OSM/OSM-Train.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/OSM/extract-singletons.perl b/scripts/OSM/extract-singletons.perl index 33f857929..83719502f 100755 --- a/scripts/OSM/extract-singletons.perl +++ b/scripts/OSM/extract-singletons.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use Getopt::Std; getopts('q'); diff --git a/scripts/OSM/flipAlignment.perl b/scripts/OSM/flipAlignment.perl index e738802b1..3559bf79b 100755 --- a/scripts/OSM/flipAlignment.perl +++ b/scripts/OSM/flipAlignment.perl @@ -1,5 +1,7 @@ -#! /usr/bin/perl - use strict; +#!/usr/bin/env perl + +use warnings; +use strict; my $file = shift(@ARGV); open(MYFILE, $file); diff --git a/scripts/Transliteration/clean.pl b/scripts/Transliteration/clean.pl index 41a55c4eb..c59bf0798 100755 --- a/scripts/Transliteration/clean.pl +++ b/scripts/Transliteration/clean.pl @@ -1,6 +1,7 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #input hindi word urdu word, delete all those entries that have number on any side +use warnings; use utf8; use Getopt::Std; diff --git a/scripts/Transliteration/corpusCreator.pl b/scripts/Transliteration/corpusCreator.pl index 8634d23dd..d2df8323c 100755 --- a/scripts/Transliteration/corpusCreator.pl +++ b/scripts/Transliteration/corpusCreator.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl index ebf1c490b..216d99a3e 100755 --- a/scripts/Transliteration/in-decoding-transliteration.pl +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 578160ba2..201f40d97 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index dfd1ed4de..4fc03b526 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/Transliteration/threshold.pl b/scripts/Transliteration/threshold.pl index 9b34bd12c..8e3704fd6 100755 --- a/scripts/Transliteration/threshold.pl +++ b/scripts/Transliteration/threshold.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use utf8; require Encode; use IO::Handle; diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index ed7f32097..05804afb6 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use utf8; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl index 50492cad0..149676b6f 100755 --- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl +++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl use utf8; ############################################### @@ -14,6 +14,7 @@ use utf8; # 23.01.2010: added NIST p-value and interval computation ############################################### +use warnings; use strict; #constants diff --git a/scripts/analysis/nontranslated_words.pl b/scripts/analysis/nontranslated_words.pl index 8fd3c4fbc..b5639429b 100755 --- a/scripts/analysis/nontranslated_words.pl +++ b/scripts/analysis/nontranslated_words.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $Id$ # Reads a source and hypothesis file and counts equal tokens. Some of these diff --git a/scripts/analysis/oov.pl b/scripts/analysis/oov.pl index 15261c410..c5d6f92e3 100755 --- a/scripts/analysis/oov.pl +++ b/scripts/analysis/oov.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Display OOV rate of a test set against a training corpus or a phrase table. # Ondrej Bojar diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl index 82ae57949..4f6560a56 100755 --- a/scripts/analysis/sentence-by-sentence.pl +++ b/scripts/analysis/sentence-by-sentence.pl @@ -1,9 +1,10 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors #usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html +use warnings; use strict; use Getopt::Long; diff --git a/scripts/analysis/sg2dot.perl b/scripts/analysis/sg2dot.perl index f6a5dff49..b17dfd9fb 100755 --- a/scripts/analysis/sg2dot.perl +++ b/scripts/analysis/sg2dot.perl @@ -1,9 +1,10 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # Author : Loic BARRAULT # Script to convert MOSES searchgraph to DOT format # +use warnings; use strict; use File::Path; use File::Basename; diff --git a/scripts/analysis/show-phrases-used.pl b/scripts/analysis/show-phrases-used.pl index 5fedf73f1..0a719d207 100755 --- a/scripts/analysis/show-phrases-used.pl +++ b/scripts/analysis/show-phrases-used.pl @@ -1,11 +1,13 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ #show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used #usage: show-phrases-used DECODER_OUTFILE > output.html # where DECODER_OUTFILE is the output of moses with the -T (show alignments) option +use warnings; use strict; + BEGIN { my $wd= `pawd 2>/dev/null`; diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl index db51da63d..9f411f3fa 100755 --- a/scripts/analysis/smtgui/filter-phrase-table.pl +++ b/scripts/analysis/smtgui/filter-phrase-table.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ #by Philipp Koehn, de-augmented by Evan Herbst @@ -9,6 +9,7 @@ #similar function to filter-model-given-input.pl, but only operates #on the phrase table and doesn't require that any subdirectories exist +use warnings; use strict; my $MAX_LENGTH = 10; diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl index 29e32d271..d1e5c1f67 100755 --- a/scripts/analysis/suspicious_tokenization.pl +++ b/scripts/analysis/suspicious_tokenization.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Collects and prints all n-grams that appear in the given corpus both # tokenized as well as untokenized. # Ondrej Bojar diff --git a/scripts/analysis/weight-scan.pl b/scripts/analysis/weight-scan.pl index 6789c4d6d..7283483e9 100755 --- a/scripts/analysis/weight-scan.pl +++ b/scripts/analysis/weight-scan.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # runs Moses many times changing the values of one weight, all others fixed # nbest lists are always produced to allow for comparison of real and # 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index f9a400eef..57ef4f9d6 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -7,8 +7,15 @@ get-corpus default-name: corpus/txt rerun-on-change: input-extension output-extension template: IN OUT $input-extension $output-extension +pre-tok-clean + in: raw-stem + out: pre-tok-cleaned + default-name: corpus/pre-tok-cleaned + pass-unless: pre-tok-clean + template: $pre-tok-clean IN $input-extension $output-extension OUT OUT.lines-retained + parallelizable: yes tokenize - in: raw-stem + in: pre-tok-cleaned out: tokenized-stem default-name: corpus/tok pass-unless: input-tokenizer output-tokenizer @@ -158,11 +165,18 @@ get-corpus pass-unless: get-corpus-script default-name: lm/txt template: $get-corpus-script > OUT +use-parallel-corpus + in: parallel-corpus-stem + out: tokenized-corpus + default-name: lm/tok + ignore-unless: parallel-corpus-stem + template: ln -s IN.$output-extension OUT tokenize in: raw-corpus out: tokenized-corpus default-name: lm/tok pass-unless: output-tokenizer + ignore-if: parallel-corpus-stem template: $output-tokenizer < IN > OUT parallelizable: yes mock-parse @@ -185,7 +199,7 @@ lowercase default-name: lm/lowercased pass-unless: output-lowercaser ignore-if: output-truecaser - only-factor-0: yes + #only-factor-0: yes template: $output-lowercaser < IN > OUT parallelizable: yes truecase @@ -204,8 +218,14 @@ split default-name: lm/split pass-unless: output-splitter template: $output-splitter -model IN1.$output-extension < IN > OUT +strip + in: split-corpus + out: stripped-corpus + default-name: lm/stripped + pass-unless: mock-output-parser-lm + template: $moses-script-dir/training/strip-xml.perl < IN > OUT train - in: split-corpus + in: stripped-corpus out: lm default-name: lm/lm ignore-if: rlm-training @@ -220,7 +240,7 @@ randomize pass-unless: lm-randomizer ignore-if: rlm-training train-randomized - in: split-corpus + in: stripped-corpus out: rlm default-name: lm/rlm ignore-unless: rlm-training @@ -940,19 +960,34 @@ truecase-reference-devtest template: $output-truecaser -model IN1.$output-extension < IN > OUT split-reference in: truecased-reference SPLITTER:splitter-model - out: reference + out: split-ref default-name: tuning/reference.split pass-unless: output-splitter multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-splitter -model IN1.$output-extension < IN > OUT split-reference-devtest in: truecased-reference-devtest SPLITTER:splitter-model - out: reference-devtest + out: split-ref-devtest default-name: tuning/reference.devtest.split pass-unless: output-splitter ignore-unless: use-mira multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-splitter -model IN1.$output-extension < IN > OUT +strip-reference + in: split-ref + out: reference + default-name: tuning/reference.stripped + pass-unless: mock-output-parser-references + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees +strip-reference-devtest + in: split-ref-devtest + out: reference + default-name: tuning/reference.devtest.stripped + pass-unless: mock-output-parser-references + ignore-unless: use-mira + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees filter in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table out: filtered-dir @@ -1203,12 +1238,19 @@ mock-parse-reference template: $mock-output-parser-references < IN > OUT lowercase-reference in: mock-parsed-reference - out: reference - default-name: evaluation/reference + out: lowercased-reference + default-name: evaluation/reference.lowercased pass-unless: output-lowercaser - pass-if: recaser + pass-if: recaser multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-lowercaser < IN > OUT +strip-reference + in: lowercased-reference + out: reference + default-name: evaluation/reference + pass-unless: mock-output-parser-references + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees wade in: filtered-dir truecased-input tokenized-reference alignment system-output out: wade-analysis diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 7a5e81eec..7070a7c9e 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1,8 +1,9 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # Experiment Management System # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); @@ -18,7 +19,18 @@ sub trim($) my $host = `hostname`; chop($host); print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`; -my ($CONFIG_FILE,$EXECUTE,$NO_GRAPH,$CONTINUE,$FINAL_STEP,$FINAL_OUT,$VERBOSE,$IGNORE_TIME,$DELETE_CRASHED,$DELETE_VERSION); +my ($CONFIG_FILE, + $EXECUTE, + $NO_GRAPH, + $CONTINUE, + $FINAL_STEP, + $FINAL_OUT, + $VERBOSE, + $IGNORE_TIME, + $DELETE_CRASHED, + $DELETE_VERSION + ); + my $SLEEP = 2; my $META = "$RealBin/experiment.meta"; @@ -3442,7 +3454,7 @@ sub create_step { $subdir = "lm" if $subdir eq "interpolated-lm"; open(STEP,">$file") or die "Cannot open: $!"; print STEP "#!/bin/bash\n\n"; - print STEP "PATH=\"".$ENV{"PATH"}."\"\n"; + print STEP "PATH=\"".$ENV{"PATH"}."\"\n"; print STEP "cd $dir\n"; print STEP "echo 'starting at '`date`' on '`hostname`\n"; print STEP "mkdir -p $dir/$subdir\n\n"; diff --git a/scripts/ems/fix-info.perl b/scripts/ems/fix-info.perl index 924a1a990..8f83d4ccf 100755 --- a/scripts/ems/fix-info.perl +++ b/scripts/ems/fix-info.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($file,$step) = @ARGV; diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index be5b76a5e..cea2657c9 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/ems/support/build-domain-file-from-subcorpora.perl b/scripts/ems/support/build-domain-file-from-subcorpora.perl index e85b6ad84..f166c8927 100755 --- a/scripts/ems/support/build-domain-file-from-subcorpora.perl +++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; # Create domain file from corpora diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl index 722f02701..5d9b786ad 100755 --- a/scripts/ems/support/build-sparse-features.perl +++ b/scripts/ems/support/build-sparse-features.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; # Build necessary files for sparse lexical features diff --git a/scripts/ems/support/consolidate-training-data.perl b/scripts/ems/support/consolidate-training-data.perl index 7ee0652d2..170ba999c 100755 --- a/scripts/ems/support/consolidate-training-data.perl +++ b/scripts/ems/support/consolidate-training-data.perl @@ -1,7 +1,8 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ +use warnings; use strict; my ($in,$out,$consolidated,@PART) = @ARGV; diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl index d7e030ad2..e5a12adce 100755 --- a/scripts/ems/support/generic-multicore-parallelizer.perl +++ b/scripts/ems/support/generic-multicore-parallelizer.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my $cores = 8; diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl index fa2d778a2..0b248be7e 100755 --- a/scripts/ems/support/generic-parallelizer.perl +++ b/scripts/ems/support/generic-parallelizer.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my $jobs = 20; diff --git a/scripts/ems/support/input-from-sgm.perl b/scripts/ems/support/input-from-sgm.perl index de888a6f3..223996676 100755 --- a/scripts/ems/support/input-from-sgm.perl +++ b/scripts/ems/support/input-from-sgm.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt") diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl index 8380f26ca..a2fe62b22 100755 --- a/scripts/ems/support/interpolate-lm.perl +++ b/scripts/ems/support/interpolate-lm.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use IPC::Open3; use File::Temp qw/tempdir/; diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl index 252c32c37..eadca6263 100755 --- a/scripts/ems/support/lmplz-wrapper.perl +++ b/scripts/ems/support/lmplz-wrapper.perl @@ -1,27 +1,26 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; -my ($TEXT,$ORDER,$PRUNE,$BIN,$LM,$MEMORY,$TMP,$DISCOUNT_FALLBACK); +Getopt::Long::config("no_auto_abbrev"); +Getopt::Long::config("pass_through"); + + +my ($TEXT,$ORDER,$BIN,$LM); &GetOptions('text=s' => \$TEXT, 'lm=s' => \$LM, 'bin=s' => \$BIN, - 'prune=s' => \$PRUNE, - 'discount_fallback' => \$DISCOUNT_FALLBACK, - 'T=s' => \$TMP, - 'S=s' => \$MEMORY, 'order=i' => \$ORDER); -die("ERROR: specify at least --text CORPUS --arpa LM and --order N!") - unless defined($TEXT) && defined($LM) && defined($ORDER); +die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!") + unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER); -my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM"; -$cmd .= " --prune $PRUNE" if defined($PRUNE); -$cmd .= " -S $MEMORY" if defined($MEMORY); -$cmd .= " -T $TMP" if defined($TMP); -$cmd .= " --discount_fallback" if defined($DISCOUNT_FALLBACK); +my $settings = join(' ', @ARGV); +#print STDERR "settngs=$settings \n"; +my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings"; print "exec: $cmd\n"; `$cmd`; diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl index f46b132a3..c50725aae 100755 --- a/scripts/ems/support/mml-filter.perl +++ b/scripts/ems/support/mml-filter.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use FindBin qw($RealBin); diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl index 86ae867f1..449d6a05c 100755 --- a/scripts/ems/support/mml-score.perl +++ b/scripts/ems/support/mml-score.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; # diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl index f68e0163f..1f0548082 100755 --- a/scripts/ems/support/mml-train.perl +++ b/scripts/ems/support/mml-train.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($indomain_source,,$indomain_target,$outdomain_source,$outdomain_target,$lm_training,$lm_binarizer,$order,$lm_settings,$line_count,$model); diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl index 1d95ea972..54c124af0 100755 --- a/scripts/ems/support/prepare-fast-align.perl +++ b/scripts/ems/support/prepare-fast-align.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($source_file,$target_file,$alignment_factors) = @ARGV; @@ -22,7 +23,7 @@ while(my $source = <SOURCE>) { # remove markup foreach my $line (\$source,\$target) { - $$line =~ s/\<[^\>]+\>//g; + $$line =~ s/\<[^\>]+\>/ /g; $$line =~ s/\s+/ /g; $$line =~ s/^ //; $$line =~ s/ $//; diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl index c504c3e75..595226bf1 100755 --- a/scripts/ems/support/reference-from-sgm.perl +++ b/scripts/ems/support/reference-from-sgm.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; die("ERROR syntax: reference-from-sgm.perl ref src out") diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl index 18918c905..d6333f813 100755 --- a/scripts/ems/support/remove-segmentation-markup.perl +++ b/scripts/ems/support/remove-segmentation-markup.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; $|++; diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl index 67963834a..2e433f291 100755 --- a/scripts/ems/support/report-experiment-scores.perl +++ b/scripts/ems/support/report-experiment-scores.perl @@ -1,7 +1,8 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $ +use warnings; use strict; my $email; diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl index 972f5602d..c3db3c4dc 100755 --- a/scripts/ems/support/run-command-on-multiple-refsets.perl +++ b/scripts/ems/support/run-command-on-multiple-refsets.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; die("ERROR: syntax: run-command-on-multiple-refsets.perl cmd in out") diff --git a/scripts/ems/support/run-wade.perl b/scripts/ems/support/run-wade.perl index 418ff8c97..25cda3bb3 100755 --- a/scripts/ems/support/run-wade.perl +++ b/scripts/ems/support/run-wade.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use strict; use File::Temp qw/ tempfile tempdir /; diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index cf7174484..f1af451b3 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # Based on Preprocessor written by Philipp Koehn @@ -6,6 +6,7 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); +use warnings; use FindBin qw($RealBin); use strict; diff --git a/scripts/ems/support/submit-grid.perl b/scripts/ems/support/submit-grid.perl new file mode 100755 index 000000000..9997241e7 --- /dev/null +++ b/scripts/ems/support/submit-grid.perl @@ -0,0 +1,62 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use Cwd; +use FindBin qw($RealBin); +use Getopt::Long; +use File::Basename; + + +my $continue = 0; +my $args = ""; +my $config; + +GetOptions("continue=i" => \$continue, + "args=s" => \$args, + "config=s" => \$config + ) or exit 1; +#print STDERR "args=$args\n"; + +# create temp run file +my $gridDir = cwd() ."/grid"; +mkdir $gridDir; + +my $runPath = "$gridDir/run.$$"; +print STDERR "runPath=$runPath\n"; + +open (my $runFile, ">", $runPath); + +print $runFile "#!/bin/bash\n"; +print $runFile "#PBS -d" .cwd() ."\n\n"; + +my $path = $ENV{"PATH"}; +my $user = $ENV{"USER"}; +#print STDERR "path=$path\n"; + +print $runFile "export PATH=\"$path\"\n\n"; +print $runFile "export PERL5LIB=\"/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1:/home/$user/perl5/lib/perl5\"\n\n"; + +print $runFile "module load NYUAD/2.0 \n"; +print $runFile "module load gcc python/2.7.9 boost cmake zlib jdk perl expat \n\n"; + +my $emsDir = dirname($RealBin); + +if ($continue) { + print $runFile "nice ionice -c 3 $emsDir/experiment.perl -exec -continue=$continue \n\n"; +} +else { + print $runFile "nice ionice -c 3 $emsDir/experiment.perl -exec -config=$config \n\n"; +} + +close $runFile; + + +my $cmd = "qsub $args $runPath"; +`$cmd`; + +unlink $runFile; + + + + diff --git a/scripts/ems/support/substitute-filtered-tables-and-weights.perl b/scripts/ems/support/substitute-filtered-tables-and-weights.perl index 3a135b44e..681d251c7 100755 --- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl +++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl index be1509b8f..e7d9f55f8 100755 --- a/scripts/ems/support/substitute-filtered-tables.perl +++ b/scripts/ems/support/substitute-filtered-tables.perl @@ -1,4 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl + +use warnings; # experiment.perl support script # get filtered rule and reordering tables and place them into a configuration file diff --git a/scripts/ems/support/substitute-weights.perl b/scripts/ems/support/substitute-weights.perl index 9a72dec8a..42357ed1e 100755 --- a/scripts/ems/support/substitute-weights.perl +++ b/scripts/ems/support/substitute-weights.perl @@ -1,4 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl + +use warnings; # experiment.perl support script # get filtered rule and reordering tables and place them into a configuration file diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl index 40583ee15..90621dea9 100755 --- a/scripts/ems/support/symmetrize-fast-align.perl +++ b/scripts/ems/support/symmetrize-fast-align.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; die("ERROR: syntax is fastalign2bal.perl direct-alignment inverse-alignment source-file target-file out-stem symmetrization-method symal\n") unless scalar(@ARGV) == 7; diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl index e6f7839f1..222623c5b 100755 --- a/scripts/ems/support/thot-lm-wrapper.perl +++ b/scripts/ems/support/thot-lm-wrapper.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/ems/support/tree-converter-wrapper.perl b/scripts/ems/support/tree-converter-wrapper.perl index aae55991a..a37654cf1 100755 --- a/scripts/ems/support/tree-converter-wrapper.perl +++ b/scripts/ems/support/tree-converter-wrapper.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl use warnings; use strict; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index beeca6cdd..28708a62a 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($language,$src,$system) = @ARGV; diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index a64d5977f..00bb9e15f 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -1261,8 +1261,8 @@ function input_annotation($sentence,$input,$segmentation,$filter) { for($j=$from;$j<=$to;$j++) { if ($j>$from) { $phrase .= " "; } $phrase .= $word[$j]; - $highlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='#ffff80';"; - $lowlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';"; + $highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';"; + $lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';"; } print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($phrase)."');\"":"").">"; } @@ -1443,10 +1443,10 @@ function biconcor($query) { $sentence = $_GET['sentence']; $biconcor = get_biconcor_version($dir,$set,$id); print "<center> -<form method=get id=\"BiconcorForm\"> +<form method=\"get\" id=\"BiconcorForm\" onsubmit=\"return false;\"> <img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\"> <input width=20 id=\"BiconcorQuery\" value=\"$query\"> -<input type=submit onclick=\"show_biconcor($sentence,encodeBase64(document.getElementById('BiconcorQuery').value));\" value=\"look up\"> +<input type=submit onclick=\"show_biconcor($sentence,Base64.encode(document.getElementById('BiconcorQuery').value));\" value=\"look up\"> </form> <div class=\"biconcor-content\">"; $cmd = "./biconcor -html -l $dir/model/biconcor.$biconcor -Q ".base64_encode($query)." 2>/dev/null"; diff --git a/scripts/ems/web/base64.js b/scripts/ems/web/base64.js index e0e94d765..67fd9ad8d 100644 --- a/scripts/ems/web/base64.js +++ b/scripts/ems/web/base64.js @@ -1,108 +1,193 @@ -var END_OF_INPUT = -1; +/* + * $Id: base64.js,v 2.15 2014/04/05 12:58:57 dankogai Exp dankogai $ + * + * Licensed under the MIT license. + * http://opensource.org/licenses/mit-license + * + * References: + * http://en.wikipedia.org/wiki/Base64 + */ -var base64Chars = new Array( - 'A','B','C','D','E','F','G','H', - 'I','J','K','L','M','N','O','P', - 'Q','R','S','T','U','V','W','X', - 'Y','Z','a','b','c','d','e','f', - 'g','h','i','j','k','l','m','n', - 'o','p','q','r','s','t','u','v', - 'w','x','y','z','0','1','2','3', - '4','5','6','7','8','9','+','/' -); - -var reverseBase64Chars = new Array(); -for (var i=0; i < base64Chars.length; i++){ - reverseBase64Chars[base64Chars[i]] = i; -} - -var base64Str; -var base64Count; -function setBase64Str(str){ - base64Str = str; - base64Count = 0; -} -function readBase64(){ - if (!base64Str) return END_OF_INPUT; - if (base64Count >= base64Str.length) return END_OF_INPUT; - var c = base64Str.charCodeAt(base64Count) & 0xff; - base64Count++; - return c; -} -function encodeBase64(str){ - setBase64Str(str); - var result = ''; - var inBuffer = new Array(3); - var lineCount = 0; - var done = false; - while (!done && (inBuffer[0] = readBase64()) != END_OF_INPUT){ - inBuffer[1] = readBase64(); - inBuffer[2] = readBase64(); - result += (base64Chars[ inBuffer[0] >> 2 ]); - if (inBuffer[1] != END_OF_INPUT){ - result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30) | (inBuffer[1] >> 4) ]); - if (inBuffer[2] != END_OF_INPUT){ - result += (base64Chars [((inBuffer[1] << 2) & 0x3c) | (inBuffer[2] >> 6) ]); - result += (base64Chars [inBuffer[2] & 0x3F]); - } else { - result += (base64Chars [((inBuffer[1] << 2) & 0x3c)]); - result += ('='); - done = true; - } +(function(global) { + 'use strict'; + // existing version for noConflict() + var _Base64 = global.Base64; + var version = "2.1.7"; + // if node.js, we use Buffer + var buffer; + if (typeof module !== 'undefined' && module.exports) { + buffer = require('buffer').Buffer; + } + // constants + var b64chars + = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'; + var b64tab = function(bin) { + var t = {}; + for (var i = 0, l = bin.length; i < l; i++) t[bin.charAt(i)] = i; + return t; + }(b64chars); + var fromCharCode = String.fromCharCode; + // encoder stuff + var cb_utob = function(c) { + if (c.length < 2) { + var cc = c.charCodeAt(0); + return cc < 0x80 ? c + : cc < 0x800 ? (fromCharCode(0xc0 | (cc >>> 6)) + + fromCharCode(0x80 | (cc & 0x3f))) + : (fromCharCode(0xe0 | ((cc >>> 12) & 0x0f)) + + fromCharCode(0x80 | ((cc >>> 6) & 0x3f)) + + fromCharCode(0x80 | ( cc & 0x3f))); } else { - result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30)]); - result += ('='); - result += ('='); - done = true; - } - lineCount += 4; - if (lineCount >= 76){ - result += ('\n'); - lineCount = 0; + var cc = 0x10000 + + (c.charCodeAt(0) - 0xD800) * 0x400 + + (c.charCodeAt(1) - 0xDC00); + return (fromCharCode(0xf0 | ((cc >>> 18) & 0x07)) + + fromCharCode(0x80 | ((cc >>> 12) & 0x3f)) + + fromCharCode(0x80 | ((cc >>> 6) & 0x3f)) + + fromCharCode(0x80 | ( cc & 0x3f))); } + }; + var re_utob = /[\uD800-\uDBFF][\uDC00-\uDFFFF]|[^\x00-\x7F]/g; + var utob = function(u) { + return u.replace(re_utob, cb_utob); + }; + var cb_encode = function(ccc) { + var padlen = [0, 2, 1][ccc.length % 3], + ord = ccc.charCodeAt(0) << 16 + | ((ccc.length > 1 ? ccc.charCodeAt(1) : 0) << 8) + | ((ccc.length > 2 ? ccc.charCodeAt(2) : 0)), + chars = [ + b64chars.charAt( ord >>> 18), + b64chars.charAt((ord >>> 12) & 63), + padlen >= 2 ? '=' : b64chars.charAt((ord >>> 6) & 63), + padlen >= 1 ? '=' : b64chars.charAt(ord & 63) + ]; + return chars.join(''); + }; + var btoa = global.btoa ? function(b) { + return global.btoa(b); + } : function(b) { + return b.replace(/[\s\S]{1,3}/g, cb_encode); + }; + var _encode = buffer ? function (u) { + return (u.constructor === buffer.constructor ? u : new buffer(u)) + .toString('base64') } - return result; -} -function readReverseBase64(){ - if (!base64Str) return END_OF_INPUT; - while (true){ - if (base64Count >= base64Str.length) return END_OF_INPUT; - var nextCharacter = base64Str.charAt(base64Count); - base64Count++; - if (reverseBase64Chars[nextCharacter]){ - return reverseBase64Chars[nextCharacter]; + : function (u) { return btoa(utob(u)) } + ; + var encode = function(u, urisafe) { + return !urisafe + ? _encode(String(u)) + : _encode(String(u)).replace(/[+\/]/g, function(m0) { + return m0 == '+' ? '-' : '_'; + }).replace(/=/g, ''); + }; + var encodeURI = function(u) { return encode(u, true) }; + // decoder stuff + var re_btou = new RegExp([ + '[\xC0-\xDF][\x80-\xBF]', + '[\xE0-\xEF][\x80-\xBF]{2}', + '[\xF0-\xF7][\x80-\xBF]{3}' + ].join('|'), 'g'); + var cb_btou = function(cccc) { + switch(cccc.length) { + case 4: + var cp = ((0x07 & cccc.charCodeAt(0)) << 18) + | ((0x3f & cccc.charCodeAt(1)) << 12) + | ((0x3f & cccc.charCodeAt(2)) << 6) + | (0x3f & cccc.charCodeAt(3)), + offset = cp - 0x10000; + return (fromCharCode((offset >>> 10) + 0xD800) + + fromCharCode((offset & 0x3FF) + 0xDC00)); + case 3: + return fromCharCode( + ((0x0f & cccc.charCodeAt(0)) << 12) + | ((0x3f & cccc.charCodeAt(1)) << 6) + | (0x3f & cccc.charCodeAt(2)) + ); + default: + return fromCharCode( + ((0x1f & cccc.charCodeAt(0)) << 6) + | (0x3f & cccc.charCodeAt(1)) + ); } - if (nextCharacter == 'A') return 0; + }; + var btou = function(b) { + return b.replace(re_btou, cb_btou); + }; + var cb_decode = function(cccc) { + var len = cccc.length, + padlen = len % 4, + n = (len > 0 ? b64tab[cccc.charAt(0)] << 18 : 0) + | (len > 1 ? b64tab[cccc.charAt(1)] << 12 : 0) + | (len > 2 ? b64tab[cccc.charAt(2)] << 6 : 0) + | (len > 3 ? b64tab[cccc.charAt(3)] : 0), + chars = [ + fromCharCode( n >>> 16), + fromCharCode((n >>> 8) & 0xff), + fromCharCode( n & 0xff) + ]; + chars.length -= [0, 0, 2, 1][padlen]; + return chars.join(''); + }; + var atob = global.atob ? function(a) { + return global.atob(a); + } : function(a){ + return a.replace(/[\s\S]{1,4}/g, cb_decode); + }; + var _decode = buffer ? function(a) { + return (a.constructor === buffer.constructor + ? a : new buffer(a, 'base64')).toString(); } - return END_OF_INPUT; -} -function ntos(n){ - n=n.toString(16); - if (n.length == 1) n="0"+n; - n="%"+n; - return unescape(n); -} - -function decodeBase64(str){ - setBase64Str(str); - var result = ""; - var inBuffer = new Array(4); - var done = false; - while (!done && (inBuffer[0] = readReverseBase64()) != END_OF_INPUT - && (inBuffer[1] = readReverseBase64()) != END_OF_INPUT){ - inBuffer[2] = readReverseBase64(); - inBuffer[3] = readReverseBase64(); - result += ntos((((inBuffer[0] << 2) & 0xff)| inBuffer[1] >> 4)); - if (inBuffer[2] != END_OF_INPUT){ - result += ntos((((inBuffer[1] << 4) & 0xff)| inBuffer[2] >> 2)); - if (inBuffer[3] != END_OF_INPUT){ - result += ntos((((inBuffer[2] << 6) & 0xff) | inBuffer[3])); - } else { - done = true; - } - } else { - done = true; - } + : function(a) { return btou(atob(a)) }; + var decode = function(a){ + return _decode( + String(a).replace(/[-_]/g, function(m0) { return m0 == '-' ? '+' : '/' }) + .replace(/[^A-Za-z0-9\+\/]/g, '') + ); + }; + var noConflict = function() { + var Base64 = global.Base64; + global.Base64 = _Base64; + return Base64; + }; + // export Base64 + global.Base64 = { + VERSION: version, + atob: atob, + btoa: btoa, + fromBase64: decode, + toBase64: encode, + utob: utob, + encode: encode, + encodeURI: encodeURI, + btou: btou, + decode: decode, + noConflict: noConflict + }; + // if ES5 is available, make Base64.extendString() available + if (typeof Object.defineProperty === 'function') { + var noEnum = function(v){ + return {value:v,enumerable:false,writable:true,configurable:true}; + }; + global.Base64.extendString = function () { + Object.defineProperty( + String.prototype, 'fromBase64', noEnum(function () { + return decode(this) + })); + Object.defineProperty( + String.prototype, 'toBase64', noEnum(function (urisafe) { + return encode(this, urisafe) + })); + Object.defineProperty( + String.prototype, 'toBase64URI', noEnum(function () { + return encode(this, true) + })); + }; } - return result; + // that's it! +})(this); + +if (this['Meteor']) { + Base64 = global.Base64; // for normal export in Meteor.js } diff --git a/scripts/ems/web/bilingual-concordance.css b/scripts/ems/web/bilingual-concordance.css index e232337d2..4648a21dd 100644 --- a/scripts/ems/web/bilingual-concordance.css +++ b/scripts/ems/web/bilingual-concordance.css @@ -93,5 +93,6 @@ span.mismatch_aligned { td.pp_more { font-size: 70%; + color: navy; text-align: center; } diff --git a/scripts/ems/web/index.php b/scripts/ems/web/index.php index 6b785cf3f..d216b114a 100644 --- a/scripts/ems/web/index.php +++ b/scripts/ems/web/index.php @@ -8,7 +8,7 @@ require("diff.php"); require("sgviz.php"); function head($title) { - print '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> + print '<!DOCTYPE html> <html><head><title>'.$title.'</title> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> <script language="javascript" src="javascripts/prototype.js"></script> diff --git a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc index 21f8c8cf6..57f78eb53 100644 --- a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc +++ b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc @@ -32,8 +32,8 @@ in a directory of your website, e.g. /javascripts. Now, you can include the scripts by adding the following tags to the HEAD section of your HTML pages: - <script src="/javascripts/prototype.js" type="text/javascript"></script> - <script src="/javascripts/scriptaculous.js" type="text/javascript"></script> + <script src="javascripts/prototype.js" type="text/javascript"></script> + <script src="javascripts/scriptaculous.js" type="text/javascript"></script> scriptaculous.js will automatically load the other files of the script.aculo.us distribution in, provided they are accessible @@ -56,4 +56,4 @@ the sources of the examples provided. == License script.aculo.us is licensed under the terms of the MIT License, -see the included MIT-LICENSE file.
\ No newline at end of file +see the included MIT-LICENSE file. diff --git a/scripts/ems/web/overview.php b/scripts/ems/web/overview.php index e56ed6f08..ce0434bb8 100644 --- a/scripts/ems/web/overview.php +++ b/scripts/ems/web/overview.php @@ -1,6 +1,5 @@ <?php -date_default_timezone_set('Europe/London'); function setup() { $setup = file("setup"); @@ -13,7 +12,7 @@ function setup() { print "<TR><TD><A HREF=\"?setup=$dir[0]\">$dir[0]</A></TD><TD>$dir[1]</TD><TD>$dir[2]</TD><TD>$dir[3]</TD></TR>\n"; } print "</TABLE>\n"; - print "<P>To add experiment, edit /fs/thor4/html/experiment/setup"; + print "<p>To add experiment, edit the \"setup\" file.</p>"; } function overview() { @@ -26,7 +25,7 @@ function overview() { head("Task: $task ($user)"); print "<a href=\"http://www.statmt.org/wiki/?n=Experiment.$setup\">Wiki Notes</a>"; - print " | <a href=\"/\">Overview of experiments</a> | <code>$dir</code><p>"; + print " | <a href=\"?\">Overview of experiments</a> | <code>$dir</code><p>"; reset($experiment); print "<form action=\"\" method=get>\n"; diff --git a/scripts/ems/web/progress.perl b/scripts/ems/web/progress.perl index 6e26a7881..fd742e410 100755 --- a/scripts/ems/web/progress.perl +++ b/scripts/ems/web/progress.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Date::Parse; diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl index 4adc97ca2..80a1b3120 100755 --- a/scripts/fuzzy-match/create_xml.perl +++ b/scripts/fuzzy-match/create_xml.perl @@ -1,8 +1,9 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl binmode( STDIN, ":utf8" ); binmode( STDOUT, ":utf8" ); +use warnings; use strict; use FindBin qw($RealBin); use File::Basename; diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index beca70eb0..c0b25f519 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/generic/extract-factors.pl b/scripts/generic/extract-factors.pl index fdd30082f..56c719051 100755 --- a/scripts/generic/extract-factors.pl +++ b/scripts/generic/extract-factors.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ #extract-factors.pl: extract only the desired factors from a factored corpus @@ -6,6 +6,7 @@ #factor indices start at 0 #factor indices too large ought to be ignored +use warnings; use strict; my ($filename, @factors) = @ARGV; diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 71032ce1a..2b02fa869 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -1,8 +1,9 @@ -#! /usr/bin/perl -w +#!/usr/bin/env perl # example # ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput +use warnings; use strict; use File::Basename; @@ -32,8 +33,8 @@ my $glueFile; my $phraseOrientation = 0; my $phraseOrientationPriorsFile; -my $GZIP_EXEC; # = which("pigz"); -if(-f "/usr/bin/pigz") { +my $GZIP_EXEC; +if(`which pigz`) { $GZIP_EXEC = 'pigz'; } else { diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl index d13c87310..50bff1404 100755 --- a/scripts/generic/fsa2fsal.pl +++ b/scripts/generic/fsa2fsal.pl @@ -1,10 +1,11 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl # A very simple script that converts fsa format (openfst lattices) to the same # thing represented one sentence per line. It uses '|||' to delimit columns and # ' ' to delimit nodes (i.e. original lines). # Some rudimentary sanity checks are done on the fly. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +use warnings; use strict; my $errs = 0; diff --git a/scripts/generic/fsa2plf.pl b/scripts/generic/fsa2plf.pl index debf8b60d..4e7454a9f 100755 --- a/scripts/generic/fsa2plf.pl +++ b/scripts/generic/fsa2plf.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Converts AT&T FSA format to 'python lattice format'. # Note that the input FSA needs to be epsilon-free and topologically sorted. # This script checks for topological sortedness. @@ -8,6 +8,7 @@ # Note that the output format may not contain any spaces. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +use warnings; use strict; use Getopt::Long; diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl index 36aed0ecd..d1aa461ac 100755 --- a/scripts/generic/fsal2fsa.pl +++ b/scripts/generic/fsal2fsa.pl @@ -1,7 +1,8 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl # A very simple script that converts fsal back to fsa format (openfst lattices) # Ondrej Bojar, bojar@ufal.mff.cuni.cz +use warnings; use strict; while (<>) { diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl index 2becba31c..653912c5c 100755 --- a/scripts/generic/generic-parallel.perl +++ b/scripts/generic/generic-parallel.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl index 55192af74..8793d3d8e 100755 --- a/scripts/generic/giza-parallel.perl +++ b/scripts/generic/giza-parallel.perl @@ -1,8 +1,9 @@ -#! /usr/bin/perl +#!/usr/bin/env perl # example # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align +use warnings; use strict; use File::Basename; diff --git a/scripts/generic/lopar2pos.pl b/scripts/generic/lopar2pos.pl index a2b6e93b4..c75069135 100755 --- a/scripts/generic/lopar2pos.pl +++ b/scripts/generic/lopar2pos.pl @@ -1,9 +1,11 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ #lopar2pos: extract POSs from LOPAR output #usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos +use warnings; + my $infilename = shift @ARGV; open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n"; while(my $line = <INFILE>) diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index 47c7551b3..7c0f56c70 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -1,4 +1,4 @@ -#! /usr/bin/perl +#!/usr/bin/env perl # $Id$ ####################### @@ -15,6 +15,7 @@ # added checks for existence of decoder and configuration file # 26 Jul 2006 fix a bug related to the use of absolute path for srcfile and nbestfile +use warnings; use strict; ####################### diff --git a/scripts/generic/mteval-v12.pl b/scripts/generic/mteval-v12.pl index 1010eabfd..360376242 100755 --- a/scripts/generic/mteval-v12.pl +++ b/scripts/generic/mteval-v12.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use utf8; use Encode; diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index f1f8f9ef6..453c03e19 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl use warnings; use strict; diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl index 94da1504f..2f44d419f 100755 --- a/scripts/generic/multi-bleu.perl +++ b/scripts/generic/multi-bleu.perl @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ +use warnings; use strict; my $lowercase = 0; diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl index a5e5f5a6b..ea56927ac 100755 --- a/scripts/generic/ph_numbers.perl +++ b/scripts/generic/ph_numbers.perl @@ -1,4 +1,5 @@ -#!/usr/bin/perl -w
+#!/usr/bin/env perl
+
package ph_numbers;
# Script to recognize and replace numbers in Moses training corpora
@@ -6,6 +7,7 @@ package ph_numbers; #
# (c) 2013 TAUS
+use warnings;
use strict;
run() unless caller();
diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl index e34c84a74..622323bdb 100755 --- a/scripts/generic/qsub-wrapper.pl +++ b/scripts/generic/qsub-wrapper.pl @@ -1,6 +1,7 @@ -#! /usr/bin/perl +#!/usr/bin/env perl # $Id$ +use warnings; use strict; ####################### diff --git a/scripts/generic/reverse-alignment.perl b/scripts/generic/reverse-alignment.perl index e19ddc9e5..d00140c74 100755 --- a/scripts/generic/reverse-alignment.perl +++ b/scripts/generic/reverse-alignment.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my $line; diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index a5aa4fb4d..9e5ee0025 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -1,9 +1,10 @@ -#! /usr/bin/perl -w +#!/usr/bin/env perl # example # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0 # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1 +use warnings; use strict; use File::Basename; @@ -13,8 +14,8 @@ sub GetSourcePhrase($); sub NumStr($); sub CutContextFile($$$); -my $GZIP_EXEC; # = which("pigz"); -if(-f "/usr/bin/pigz") { +my $GZIP_EXEC; +if(`which pigz`) { $GZIP_EXEC = 'pigz'; } else { diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl index 40a61302a..95513b608 100755 --- a/scripts/generic/strip-xml.perl +++ b/scripts/generic/strip-xml.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; while (my $line = <STDIN>) { diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl index 8ad53e880..596143386 100755 --- a/scripts/generic/trainlm-irst2.perl +++ b/scripts/generic/trainlm-irst2.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # Compatible with sri LM-creating script, eg. # ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt @@ -10,6 +10,7 @@ # irst-dir = /Users/hieu/workspace/irstlm/trunk/bin # Set smoothing method in settings, if different from modified Kneser-Ney +use warnings; use strict; use FindBin qw($RealBin); use Getopt::Long; diff --git a/scripts/generic/trainlm-lmplz.perl b/scripts/generic/trainlm-lmplz.perl deleted file mode 100755 index f9bc0d0da..000000000 --- a/scripts/generic/trainlm-lmplz.perl +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl -w - -# Compatible with sri LM-creating script, eg. -# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt -# To use it in the EMS, add this to the [LM] section -# lm-training = "$moses-script-dir/generic/trainlm-lmplz.perl -lmplz $lmplz" -# settings = "-T $working-dir/tmp -S 10G" -# Also, make sure that $lmplz is defined (in the [LM] or [GENERAL] section. -# It should point to the binary file -# lmplz = /home/waziz/workspace/github/moses/bin/lmplz - -use strict; -use FindBin qw($RealBin); -use Getopt::Long qw/GetOptionsFromArray/; -#use Getopt::Long; -Getopt::Long::Configure("pass_through", "no_ignore_case"); - -my $order = 3; # order of language model (default trigram) -my $corpus; # input text data -my $lm; # generated language model -my $lmplz; # bin directory of IRSTLM -my $help = 0; - -my @optconfig = ( - "-order=s" => \$order, - "-text=s" => \$corpus, - "-lm=s" => \$lm, - "-lmplz=s" => \$lmplz, -); - -GetOptionsFromArray(\@ARGV, @optconfig); -die("ERROR: please set text") unless defined($corpus); -die("ERROR: please set lm") unless defined($lm); -die("ERROR: please set lmplz") unless defined($lmplz); - -my $settings = join(' ', @ARGV); -my $cmd = "$lmplz --order $order $settings < $corpus > $lm"; - -print STDERR "EXECUTING $cmd\n"; -`$cmd`; diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl index 5aa7d4f85..130afd56b 100755 --- a/scripts/other/beautify.perl +++ b/scripts/other/beautify.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use strict; use File::Basename; use FindBin qw($RealBin); diff --git a/scripts/other/convert-pt.perl b/scripts/other/convert-pt.perl index fa35b4490..f530a447a 100755 --- a/scripts/other/convert-pt.perl +++ b/scripts/other/convert-pt.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $Id$ # convert a phrase-table with alignment in Moses' dead-end format diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl index 2a4f51c89..08316c95b 100755 --- a/scripts/other/delete-scores.perl +++ b/scripts/other/delete-scores.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/other/get_many_translations_from_google.perl b/scripts/other/get_many_translations_from_google.perl index c9feae9ca..512b84e36 100755 --- a/scripts/other/get_many_translations_from_google.perl +++ b/scripts/other/get_many_translations_from_google.perl @@ -1,10 +1,12 @@ -#!/usr/bin/perl +#!/usr/bin/env perl + # Uses Google AJAX API to collect many translations, i.e. create a parallel # corpus of Google translations. # Expects one sentence per line, not tokenized! # # Ondrej Bojar, bojar@ufal.mff.cuni.cz +use warnings; use strict; use Getopt::Long; use CGI; diff --git a/scripts/other/retain-lines.perl b/scripts/other/retain-lines.perl index 6f7c517c2..b865e1af7 100755 --- a/scripts/other/retain-lines.perl +++ b/scripts/other/retain-lines.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #retain lines in clean.lines-retained.1 use strict; diff --git a/scripts/other/translate_by_microsoft_bing.perl b/scripts/other/translate_by_microsoft_bing.perl index 50e9a12d2..ad7a9c3b7 100755 --- a/scripts/other/translate_by_microsoft_bing.perl +++ b/scripts/other/translate_by_microsoft_bing.perl @@ -1,4 +1,5 @@ -#!/usr/bin/perl +#!/usr/bin/env perl + # Script implemented by Pranava Swaroop Madhyastha (a student at Charles # University, UFAL) diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index 012c143ac..549cd8abe 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index 2858cda61..3ba83712a 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index ad75af068..87a720f6e 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ +use warnings; use strict; use FindBin qw($Bin); use Getopt::Long "GetOptions"; diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 59a83ec91..b653a8ca5 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ @@ -8,6 +8,7 @@ # --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token. # +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index ca0cf44ee..373aa509f 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -1,6 +1,8 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ + +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/regression-testing/compare-results.pl b/scripts/regression-testing/compare-results.pl index 744334d29..df14d444f 100755 --- a/scripts/regression-testing/compare-results.pl +++ b/scripts/regression-testing/compare-results.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($results, $truth) = @ARGV; diff --git a/scripts/regression-testing/create_localized_moses_ini.pl b/scripts/regression-testing/create_localized_moses_ini.pl index b102a4ed4..612a39e82 100755 --- a/scripts/regression-testing/create_localized_moses_ini.pl +++ b/scripts/regression-testing/create_localized_moses_ini.pl @@ -1,5 +1,6 @@ -#! /usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } use MosesScriptsRegressionTesting; diff --git a/scripts/regression-testing/modify-pars.pl b/scripts/regression-testing/modify-pars.pl index 23576873b..5ad2514a4 100755 --- a/scripts/regression-testing/modify-pars.pl +++ b/scripts/regression-testing/modify-pars.pl @@ -1,5 +1,6 @@ -#! /usr/bin/perl +#!/usr/bin/env perl +use warnings; use strict; my $argv=join(" ",@ARGV); diff --git a/scripts/regression-testing/moses-virtual.pl b/scripts/regression-testing/moses-virtual.pl index 4bb852242..41ddd6b13 100755 --- a/scripts/regression-testing/moses-virtual.pl +++ b/scripts/regression-testing/moses-virtual.pl @@ -1,5 +1,6 @@ -#! /usr/bin/perl +#!/usr/bin/env perl +use warnings; use strict; my %opt = (); diff --git a/scripts/regression-testing/run-single-test.pl b/scripts/regression-testing/run-single-test.pl index 0c5efa3a7..bb66e96f6 100755 --- a/scripts/regression-testing/run-single-test.pl +++ b/scripts/regression-testing/run-single-test.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } use MosesScriptsRegressionTesting; diff --git a/scripts/regression-testing/run-test-suite.pl b/scripts/regression-testing/run-test-suite.pl index 7cdfc21d9..8ae9ec60f 100755 --- a/scripts/regression-testing/run-test-suite.pl +++ b/scripts/regression-testing/run-test-suite.pl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use strict; my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } use Getopt::Long; diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl index ca4e8a1b3..0e73a7718 100755 --- a/scripts/tokenizer/deescape-special-chars-PTB.perl +++ b/scripts/tokenizer/deescape-special-chars-PTB.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; while(<STDIN>) { diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index 7dc6bc539..076d1e62f 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; while(<STDIN>) { diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl index a8de7e86e..7874d5d04 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $ # Sample De-Tokenizer @@ -7,6 +7,8 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); + +use warnings; use strict; use utf8; # tell perl this script file is in UTF-8 (see all funny punct below) @@ -36,7 +38,7 @@ if ($HELP) { exit; } -if ($language !~ /^(cs|en|fr|it)$/) { +if ($language !~ /^(cs|en|fr|it|fi)$/) { print STDERR "Warning: No built-in rules for language $language.\n" } @@ -176,6 +178,11 @@ sub detokenize { } + } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) { + # Finnish : without intervening space if followed by case suffix + # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ... + $text=$text. lc $words[$i]; + $prependSpace = " "; } else { $text=$text.$prependSpace.$words[$i]; $prependSpace = " "; diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index 89afdb0e3..e94b91744 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; while(<STDIN>) { diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl index c30e029b9..9ee307bc2 100755 --- a/scripts/tokenizer/lowercase.perl +++ b/scripts/tokenizer/lowercase.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; binmode(STDIN, ":utf8"); diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl index c679ab2a7..db8f9c60e 100755 --- a/scripts/tokenizer/normalize-punctuation.perl +++ b/scripts/tokenizer/normalize-punctuation.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my $language = "en"; diff --git a/scripts/tokenizer/pre-tok-clean.perl b/scripts/tokenizer/pre-tok-clean.perl new file mode 100755 index 000000000..900e992ee --- /dev/null +++ b/scripts/tokenizer/pre-tok-clean.perl @@ -0,0 +1,46 @@ +#!/usr/bin/env perl + +use strict; + +my $minChars = $ARGV[0]; +my $maxChars = $ARGV[1]; +my $inputStem = $ARGV[2]; +my $source = $ARGV[3]; +my $target = $ARGV[4]; +my $outputStem = $ARGV[5]; +my $linesRetained = $ARGV[6]; + +open(IN_SOURCE, "<:encoding(UTF-8)", "$inputStem.$source") or die "cannot open $inputStem.$source"; +open(IN_TARGET, "<:encoding(UTF-8)", "$inputStem.$target") or die "cannot open $inputStem.$target"; + +open(OUT_SOURCE, ">:encoding(UTF-8)", "$outputStem.$source") or die "cannot open $outputStem.$source"; +open(OUT_TARGET, ">:encoding(UTF-8)", "$outputStem.$target") or die "cannot open $outputStem.$target"; + +open(LINE_RETAINED, ">:encoding(UTF-8)", "$linesRetained"); + +my $lineNum = 0; +while (my $lineSource = <IN_SOURCE>) { + ++$lineNum; + #print STDERR "$lineNum "; + + chomp($lineSource); + my $lineTarget = <IN_TARGET>; + chomp($lineTarget); + + my $lenSource = length($lineSource); + my $lenTarget = length($lineTarget); + + if ($lenSource < $minChars || $lenSource > $maxChars + || $lenTarget < $minChars || $lenTarget > $maxChars) { + # do nothing + } + else { + print OUT_SOURCE "$lineSource\n"; + print OUT_TARGET "$lineTarget\n"; + print LINE_RETAINED "$lineNum\n"; + } +} + +close(OUT_SOURCE); +close(OUT_SOURCE); +close(LINE_RETAINED); diff --git a/scripts/tokenizer/pre-tokenizer.perl b/scripts/tokenizer/pre-tokenizer.perl index cb6218716..499671b44 100755 --- a/scripts/tokenizer/pre-tokenizer.perl +++ b/scripts/tokenizer/pre-tokenizer.perl @@ -1,8 +1,10 @@ -#!/usr/bin/perl -W +#!/usr/bin/env perl + # script for preprocessing language data prior to tokenization # Start by Ulrich Germann, after noticing systematic preprocessing errors # in some of the English Europarl data. +use warnings; use strict; use Getopt::Std; diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py new file mode 100644 index 000000000..76736da5c --- /dev/null +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python -*- coding: utf-8 -*- + +""" +The Gacha filter cleans out sentence pairs that have global character mean +lower than a certain threshold. + +Use this cleaner to produce low quantity of high quality sentence pairs. + +It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during +WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER. +(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf) + +This is inspired by the global character mean that is used in the Gale-Church +algorithm (Gale aand Church, 1993), the c variable in: + + delta = (l2-l1*c)/math.sqrt(l1*s2) + +where: + - l1 = len(source_sentence) + - l2 = len(target_sentence) + - c = global mean, i.e. #char in source corpus / #char in target corpus + - s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1) + +(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf) +""" + +import io, subprocess + +red = '\033[01;31m' +native = '\033[m' + +def err_msg(txt): + return red+txt+native + +def num_char(filename): + return float(subprocess.Popen(["wc", "-m", filename], + stdout=subprocess.PIPE).stdout.read().split()[0]) + +def gacha_mean(sourcefile, targetfile): + """ + Counts the global character mean between source and target language as + in Gale-Church (1993) + """ + sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n')) + c = num_char(sourcefile) / num_char(targetfile) + sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n')) + sys.stderr.write(err_msg('Filtering starts ...\n')) + return c + +def main(sourcefile, targetfile, threshold=0.2): + # Calculates Gacha mean. + c = gacha_mean(sourcefile, targetfile) + # Calculates lower and upperbound for filtering + threshold = float(threshold) + lowerbound = (1-threshold) * c + upperbound = (1+threshold) * c + + # Start filtering sentences. + with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \ + io.open(targetfile, 'r', encoding='utf8') as trgfin: + for s, t in zip(srcfin, trgfin): + if lowerbound < len(s) / float(len(t)) < upperbound: + print(u"{}\t{}\n".format(s.strip(),t.strip())) + +if __name__ == '__main__': + import sys + if len(sys.argv) not in range(3,5): + usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n' + % sys.argv[0]) + + example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de ' + '~/Europarl.de-en.en 0.4\n' + % sys.argv[0]) + sys.stderr.write(usage_msg) + sys.stderr.write(example_msg) + sys.exit(1) + + main(*sys.argv[1:]) diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl index 2a7bec07b..2b90dfd3b 100755 --- a/scripts/tokenizer/remove-non-printing-char.perl +++ b/scripts/tokenizer/remove-non-printing-char.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use utf8; binmode(STDIN, ":utf8"); diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl index ab1d5808d..08eb766bf 100755 --- a/scripts/tokenizer/replace-unicode-punctuation.perl +++ b/scripts/tokenizer/replace-unicode-punctuation.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; #binmode(STDIN, ":utf8"); diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index 03143e467..8abffbea4 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -1,4 +1,5 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl + use warnings; # Sample Tokenizer @@ -15,10 +16,15 @@ use warnings; binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); +use warnings; use FindBin qw($RealBin); use strict; use Time::HiRes; -use Thread; + +if (eval {require Thread;1;}) { + #module loaded + Thread->import(); +} my $mydir = "$RealBin/../share/nonbreaking_prefixes"; diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl index e2cce2e4b..bce7a38a0 100755 --- a/scripts/tokenizer/tokenizer_PTB.perl +++ b/scripts/tokenizer/tokenizer_PTB.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # Sample Tokenizer ### Version 1.1 @@ -14,6 +14,7 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); +use warnings; use FindBin qw($RealBin); use strict; use Time::HiRes; diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl index 99efafe8e..5c9c0970a 100755 --- a/scripts/training/absolutize_moses_model.pl +++ b/scripts/training/absolutize_moses_model.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $Id$ # given a moses.ini file, prints a copy to stdout but replaces all relative @@ -6,6 +6,8 @@ # # Ondrej Bojar. +use warnings; + my $ini = shift; die "usage: absolutize_moses_model.pl path-to-moses.ini > moses.abs.ini" if !defined $ini; diff --git a/scripts/training/analyse_moses_model.pl b/scripts/training/analyse_moses_model.pl index 62dab218f..7a3b27e65 100755 --- a/scripts/training/analyse_moses_model.pl +++ b/scripts/training/analyse_moses_model.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $Id$ # given a moses.ini file, checks the translation and generation tables and reports diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py index 66f8f0413..cd8755580 100755 --- a/scripts/training/bilingual-lm/extract_training.py +++ b/scripts/training/bilingual-lm/extract_training.py @@ -147,7 +147,7 @@ def main(): #Numberize the file for line in ngrams_file_handle: - numberized_file_handle.write(extract.numberize(line, m, n, tvocab_idmap, tvocab_idmap)) + numberized_file_handle.write(extract.numberize(line, options.m, options.n, svocab_idmap, tvocab_idmap)) numberized_file_handle.close() ngrams_file_handle.close() diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 15ad23ac4..3d4798ffd 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -1,9 +1,10 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # Binarize a Moses model # +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/training/build-generation-table.perl b/scripts/training/build-generation-table.perl index cf707811e..fb59f4acc 100755 --- a/scripts/training/build-generation-table.perl +++ b/scripts/training/build-generation-table.perl @@ -1,6 +1,7 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $Id$ +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl index 00a56977e..a7ddaff70 100755 --- a/scripts/training/build-mmsapt.perl +++ b/scripts/training/build-mmsapt.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl index 18282858d..e1e96528c 100755 --- a/scripts/training/clean-corpus-n.perl +++ b/scripts/training/clean-corpus-n.perl @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $ +use warnings; use strict; use Getopt::Long; my $help; @@ -49,7 +50,7 @@ my $l1input = "$corpus.$l1"; if (-e $l1input) { $opn = $l1input; } elsif (-e $l1input.".gz") { - $opn = "zcat $l1input.gz |"; + $opn = "gunzip -c $l1input.gz |"; } else { die "Error: $l1input does not exist"; } @@ -59,7 +60,7 @@ my $l2input = "$corpus.$l2"; if (-e $l2input) { $opn = $l2input; } elsif (-e $l2input.".gz") { - $opn = "zcat $l2input.gz |"; + $opn = "gunzip -c $l2input.gz |"; } else { die "Error: $l2input does not exist"; } @@ -154,7 +155,7 @@ print STDERR "Input sentences: $innr Output sentences: $outnr\n"; sub word_count { my ($line) = @_; if ($ignore_xml) { - $line =~ s/<\S[^>]*\S>//g; + $line =~ s/<\S[^>]*\S>/ /g; $line =~ s/\s+/ /g; $line =~ s/^ //g; $line =~ s/ $//g; diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl index 29aed3f2a..5e9dff72a 100755 --- a/scripts/training/clone_moses_model.pl +++ b/scripts/training/clone_moses_model.pl @@ -1,10 +1,11 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $Id$ # given a moses.ini file, creates a fresh version of it # in the current directory # All relevant files are hardlinked or copied to the directory, too. +use warnings; use strict; use Getopt::Long; diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl index 8a57a6b57..dfdf020a0 100755 --- a/scripts/training/combine_factors.pl +++ b/scripts/training/combine_factors.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $Id$ # given a list of files, combines them to a single corpus (sent to stdout) diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl index 867c7eca7..25c562ef4 100755 --- a/scripts/training/convert-moses-ini-to-v2.perl +++ b/scripts/training/convert-moses-ini-to-v2.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my $header = ""; diff --git a/scripts/training/corpus-sizes.perl b/scripts/training/corpus-sizes.perl index f317c5665..02dd4ae9b 100755 --- a/scripts/training/corpus-sizes.perl +++ b/scripts/training/corpus-sizes.perl @@ -1,7 +1,8 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ +use warnings; use strict; my ($in,$out,@PART) = @ARGV; diff --git a/scripts/training/exodus.perl b/scripts/training/exodus.perl index f5a5cbdea..d3466f5dd 100755 --- a/scripts/training/exodus.perl +++ b/scripts/training/exodus.perl @@ -1,7 +1,8 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ +use warnings; use strict; my @LINE = <STDIN>; diff --git a/scripts/training/get-lexical.perl b/scripts/training/get-lexical.perl index e23c15665..45fe6d54c 100755 --- a/scripts/training/get-lexical.perl +++ b/scripts/training/get-lexical.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use strict; use FindBin qw($RealBin); BEGIN { require "$RealBin/LexicalTranslationModel.pm"; "LexicalTranslationModel"->import; } diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl index 553ff2b3e..56fc9a466 100755 --- a/scripts/training/giza2bal.pl +++ b/scripts/training/giza2bal.pl @@ -1,4 +1,4 @@ -#! /usr/bin/perl +#!/usr/bin/env perl # $Id$ #Converts direct and inverted alignments into a more compact @@ -7,6 +7,8 @@ #Copyright Marcello Federico, November 2004 +#use warnings; + ($cnt,$dir,$inv)=(); while ($w=shift @ARGV){ @@ -17,7 +19,7 @@ while ($w=shift @ARGV){ my $lc = 0; -if (!$dir || !inv){ +if (!$dir || !$inv){ print "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n"; print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n"; exit(0); diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 04e174c1b..86084abbf 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ # Usage: # mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config> @@ -47,6 +47,7 @@ # 13 Oct 2004 Use alternative decoders (DWC) # Original version by Philipp Koehn +use warnings; use strict; use FindBin qw($RealBin); use File::Basename; diff --git a/scripts/training/postprocess-lopar.perl b/scripts/training/postprocess-lopar.perl index b5ae79b2a..5171e02fb 100755 --- a/scripts/training/postprocess-lopar.perl +++ b/scripts/training/postprocess-lopar.perl @@ -1,7 +1,8 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # $Id$ +use warnings; use strict; use utf8; diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl index c7269abf9..c265652f6 100755 --- a/scripts/training/reduce-factors.perl +++ b/scripts/training/reduce-factors.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); @@ -10,11 +11,12 @@ my $___FACTOR_DELIMITER = "|"; my $ZCAT = "gzip -cd"; my $BZCAT = "bzcat"; -my ($CORPUS,$REDUCED,$FACTOR); +my ($CORPUS,$REDUCED,$FACTOR,$_XML); die("ERROR: wrong syntax when invoking reduce-factors") unless &GetOptions('corpus=s' => \$CORPUS, 'reduced-corpus=s' => \$REDUCED, - 'factor=s' => \$FACTOR); + 'factor=s' => \$FACTOR, + 'xml' => \$_XML); &reduce_factors($CORPUS,$REDUCED,$FACTOR); @@ -24,9 +26,9 @@ sub reduce_factors { my @INCLUDE = sort {$a <=> $b} split(/,/,$factors); - print "Reducing factors to produce $reduced @ ".`date`; + print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`; while(-e $reduced.".lock") { - sleep(10); + sleep(10); } if (-e $reduced) { print STDERR " $reduced in place, reusing\n"; @@ -37,29 +39,31 @@ sub reduce_factors { return; } - # peek at input, to check if we are asked to produce exactly the - # available factors - my $inh = open_or_zcat($full); - my $firstline = <$inh>; - die "Corpus file $full is empty" unless $firstline; - close $inh; - # pick first word - $firstline =~ s/^\s*//; - $firstline =~ s/\s.*//; - # count factors - my @WORD = split(/ /,$firstline); - my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]); - my $maxfactorindex = scalar(@FACTOR)-1; - if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) { - # create just symlink; preserving compression - my $realfull = $full; - if (!-e $realfull && -e $realfull.".gz") { + unless ($_XML) { + # peek at input, to check if we are asked to produce exactly the + # available factors + my $inh = open_or_zcat($full); + my $firstline = <$inh>; + die "Corpus file $full is empty" unless $firstline; + close $inh; + # pick first word + $firstline =~ s/^\s*//; + $firstline =~ s/\s.*//; + # count factors + my @WORD = split(/ /,$firstline); + my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]); + my $maxfactorindex = scalar(@FACTOR)-1; + if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) { + # create just symlink; preserving compression + my $realfull = $full; + if (!-e $realfull && -e $realfull.".gz") { $realfull .= ".gz"; $reduced =~ s/(\.gz)?$/.gz/; - } - safesystem("ln -s '$realfull' '$reduced'") + } + safesystem("ln -s '$realfull' '$reduced'") or die "Failed to create symlink $realfull -> $reduced"; - return; + return; + } } # The default is to select the needed factors @@ -71,23 +75,30 @@ sub reduce_factors { $nr++; print STDERR "." if $nr % 10000 == 0; print STDERR "($nr)" if $nr % 100000 == 0; - chomp; s/ +/ /g; s/^ //; s/ $//; - my $first = 1; - foreach (split) { - my @FACTOR = split /\Q$___FACTOR_DELIMITER/; + s/<\S[^>]*>/ /g if $_XML; # remove xml + chomp; s/ +/ /g; s/^ //; s/ $//; + my $first = 1; + foreach (split) { + my @FACTOR = split /\Q$___FACTOR_DELIMITER/; # \Q causes to disable metacharacters in regex - print OUT " " unless $first; - $first = 0; - my $first_factor = 1; + print OUT " " unless $first; + $first = 0; + my $first_factor = 1; foreach my $outfactor (@INCLUDE) { - print OUT "|" unless $first_factor; + print OUT $___FACTOR_DELIMITER unless $first_factor; $first_factor = 0; my $out = $FACTOR[$outfactor]; die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out; print OUT $out; } - } - print OUT "\n"; + # for(my $factor=0;$factor<=$#FACTOR;$factor++) { + # next unless defined($INCLUDE{$factor}); + # print OUT "|" unless $first_factor; + # $first_factor = 0; + # print OUT $FACTOR[$factor]; + # } + } + print OUT "\n"; } print STDERR "\n"; close(OUT); diff --git a/scripts/training/reduce-topt-count.pl b/scripts/training/reduce-topt-count.pl index 15458b0b5..769f44a7e 100755 --- a/scripts/training/reduce-topt-count.pl +++ b/scripts/training/reduce-topt-count.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # given a moses.ini, filter the phrase tables to contain # only ttable-limit options per source phrase diff --git a/scripts/training/reduce_combine.pl b/scripts/training/reduce_combine.pl index 1c7908454..3d0abf29a 100755 --- a/scripts/training/reduce_combine.pl +++ b/scripts/training/reduce_combine.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $Id$ # given a pathname to a factored corpus, a list of (numeric) factors to keep diff --git a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl index b12281cf8..bd5d7f1d2 100755 --- a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl +++ b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($ttable_file) = @ARGV; diff --git a/scripts/training/strip-xml.perl b/scripts/training/strip-xml.perl new file mode 100755 index 000000000..0f403d15d --- /dev/null +++ b/scripts/training/strip-xml.perl @@ -0,0 +1,17 @@ +#!/usr/bin/env perl + +# strip text file of any XML markup + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use strict; + +while(<STDIN>) { + s/<\S[^>]*>/ /g; + chomp; + s/ +/ /g; + s/^ //; + print $_; + print "\n"; +} diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl index 1d5cfbbb4..a23fb8b5c 100755 --- a/scripts/training/threshold-filter.perl +++ b/scripts/training/threshold-filter.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my %MIN_SCORE; diff --git a/scripts/training/train-global-lexicon-model.perl b/scripts/training/train-global-lexicon-model.perl index f18fb6f2e..0e7d3077d 100755 --- a/scripts/training/train-global-lexicon-model.perl +++ b/scripts/training/train-global-lexicon-model.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use Switch; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index f92e545be..4c355479c 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); @@ -404,8 +405,8 @@ else { $SORT_EXEC = 'sort'; } -my $GZIP_EXEC; # = which("pigz"); -if(-f "/usr/bin/pigz") { +my $GZIP_EXEC; +if(`which pigz`) { $GZIP_EXEC = 'pigz'; } else { diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl index adb34df2f..3dd8fc4ac 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; while(<STDIN>) { diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl index e447ee146..e61a53652 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; while(<STDIN>) { diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py index 69ee4f737..0e361df0b 100755 --- a/scripts/training/wrappers/conll2mosesxml.py +++ b/scripts/training/wrappers/conll2mosesxml.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich diff --git a/scripts/training/wrappers/filter-excluded-lines.perl b/scripts/training/wrappers/filter-excluded-lines.perl index 16584ca8f..7f9da3efa 100755 --- a/scripts/training/wrappers/filter-excluded-lines.perl +++ b/scripts/training/wrappers/filter-excluded-lines.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long; diff --git a/scripts/training/wrappers/find-unparseable.perl b/scripts/training/wrappers/find-unparseable.perl index 0aa560815..b0d38027b 100755 --- a/scripts/training/wrappers/find-unparseable.perl +++ b/scripts/training/wrappers/find-unparseable.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my $lineNum = 1; diff --git a/scripts/training/wrappers/mada-wrapper.perl b/scripts/training/wrappers/mada-wrapper.perl index dd83346ca..20f76f821 100755 --- a/scripts/training/wrappers/mada-wrapper.perl +++ b/scripts/training/wrappers/mada-wrapper.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use File::Temp qw/tempfile/; use Getopt::Long "GetOptions"; diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl new file mode 100755 index 000000000..6e7efe245 --- /dev/null +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -0,0 +1,93 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use File::Temp qw/tempfile/; +use Getopt::Long "GetOptions"; +use File::Basename; +use FindBin qw($RealBin); +use Cwd 'abs_path'; + +my $TMPDIR = "tmp"; +my $SCHEME = "D2"; +my $KEEP_TMP = 0; +my $MADA_DIR; + +GetOptions( + "scheme=s" => \$SCHEME, + "tmpdir=s" => \$TMPDIR, + "keep-tmp" => \$KEEP_TMP, + "mada-dir=s" => \$MADA_DIR + ) or die("ERROR: unknown options"); + +$TMPDIR = abs_path($TMPDIR); +print STDERR "TMPDIR=$TMPDIR \n"; + +#binmode(STDIN, ":utf8"); +#binmode(STDOUT, ":utf8"); + +$TMPDIR = "$TMPDIR/madamira.$$"; +`mkdir -p $TMPDIR`; +`mkdir -p $TMPDIR/split`; +`mkdir -p $TMPDIR/out`; + +my $infile = "$TMPDIR/input"; +print STDERR $infile."\n"; + +open(TMP,">$infile"); +while(<STDIN>) { + print TMP $_; +} +close(TMP); + +my $cmd; + +# split input file +my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; +if($SPLIT_EXEC) { + $SPLIT_EXEC = 'gsplit'; +} +else { + $SPLIT_EXEC = 'split'; +} + +$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x"; +`$cmd`; + +$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*"; +print STDERR "Executing: $cmd\n"; +`$cmd`; + +$cmd = "cat $TMPDIR/out/x*.mada > $infile.mada"; +print STDERR "Executing: $cmd\n"; +`$cmd`; + +# get stuff out of mada output +open(MADA_OUT,"<$infile.mada"); +#binmode(MADA_OUT, ":utf8"); +while(my $line = <MADA_OUT>) { + chop($line); + #print STDERR "line=$line \n"; + + if (index($line, "SENTENCE BREAK") == 0) { + # new sentence + #print STDERR "BREAK\n"; + print "\n"; + } + elsif (index($line, ";;WORD") == 0) { + # word + my $word = substr($line, 7, length($line) - 8); + #print STDERR "FOund $word\n"; + print "$word "; + } + else { + #print STDERR "NADA\n"; + } +} +close (MADA_OUT); + + +if ($KEEP_TMP == 0) { +# `rm -rf $TMPDIR`; +} + diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl index 38e331737..88d16b3f6 100755 --- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl +++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV; diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl index d96a1ce00..1cc917bce 100755 --- a/scripts/training/wrappers/make-factor-de-morph.perl +++ b/scripts/training/wrappers/make-factor-de-morph.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
use Encode;
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl index 459961c77..2eadd4123 100755 --- a/scripts/training/wrappers/make-factor-de-pos.perl +++ b/scripts/training/wrappers/make-factor-de-pos.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w
+#!/usr/bin/env perl
+use warnings;
use strict;
my ($in,$out,$tmpdir) = @ARGV;
diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl index c3c309bad..0d27aa12f 100755 --- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl +++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl index d2b5a755c..2af6eb75c 100755 --- a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl +++ b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; # handle switches diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl index 892c1636c..60aca0b34 100755 --- a/scripts/training/wrappers/make-factor-stem.perl +++ b/scripts/training/wrappers/make-factor-stem.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($size,$in,$out) = @ARGV; diff --git a/scripts/training/wrappers/make-factor-suffix.perl b/scripts/training/wrappers/make-factor-suffix.perl index 20247a013..7e864ea0c 100755 --- a/scripts/training/wrappers/make-factor-suffix.perl +++ b/scripts/training/wrappers/make-factor-suffix.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; my ($size,$in,$out) = @ARGV; diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl index ef6e66024..fc1f0c532 100755 --- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; #( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) ) diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py new file mode 100755 index 000000000..bd876f087 --- /dev/null +++ b/scripts/training/wrappers/mosesxml2brackets.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +# convert trees in moses XML format to PTB-style bracketed format + +from __future__ import print_function, unicode_literals +import sys +import codecs + +from lxml import etree as ET + +def escape(word): + word = word.replace('|','|') # factor separator + word = word.replace('[','[') # syntax non-terminal + word = word.replace(']',']') # syntax non-terminal + word = word.replace('\'',''') + word = word.replace('\"','"') + + return word + +def make_brackets(xml): + + out = ' [' + xml.get('label') + + if xml.text and xml.text.strip(): + word = escape(xml.text.strip()) + out += ' ' + word + ']' + + else: + for child in xml: + out += make_brackets(child) + + out += ']' + + return out + + +if __name__ == '__main__': + + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + + for line in sys.stdin: + if line == '\n': + sys.stdout.write(line) + continue + out = make_brackets(ET.fromstring(line)).strip() + sys.stdout.write(out + '\n') diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl index 03d90eaca..68df07c49 100755 --- a/scripts/training/wrappers/parse-de-berkeley.perl +++ b/scripts/training/wrappers/parse-de-berkeley.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl index f884b5c01..4723d6aa0 100755 --- a/scripts/training/wrappers/parse-de-bitpar.perl +++ b/scripts/training/wrappers/parse-de-bitpar.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/parse-en-collins.perl b/scripts/training/wrappers/parse-en-collins.perl index d71f1a293..27b33a2dd 100755 --- a/scripts/training/wrappers/parse-en-collins.perl +++ b/scripts/training/wrappers/parse-en-collins.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use File::Basename; use File::Temp qw/tempfile/; diff --git a/scripts/training/wrappers/parse-en-egret.perl b/scripts/training/wrappers/parse-en-egret.perl index 70403c970..c3d23a4ee 100755 --- a/scripts/training/wrappers/parse-en-egret.perl +++ b/scripts/training/wrappers/parse-en-egret.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl index d78106fe2..1bb616939 100755 --- a/scripts/training/wrappers/syntax-hyphen-splitting.perl +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl index 1e4b5495d..4f26efabe 100755 --- a/scripts/training/wrappers/tagger-german-chunk.perl +++ b/scripts/training/wrappers/tagger-german-chunk.perl @@ -1,5 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; |