Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--biconcor/phrase-lookup.cpp11
-rw-r--r--contrib/lmserver/examples/lmclient.cc4
-rwxr-xr-xcontrib/mada/qsub-madamira.perl46
-rw-r--r--contrib/mira/Main.cpp2
-rw-r--r--contrib/mira/Main.h4
-rw-r--r--contrib/other-builds/manual-label/manual-label.project23
-rw-r--r--contrib/other-builds/moses-cmd/moses-cmd.project35
-rw-r--r--contrib/other-builds/moses/moses.project2
-rwxr-xr-xcontrib/relent-filter/src/Main.cpp3
-rw-r--r--contrib/server/mosesserver.cpp2
-rw-r--r--mert/Data.cpp3
-rw-r--r--mert/Fdstream.h2
-rw-r--r--mert/FileStream.cpp25
-rw-r--r--mert/FileStream.h16
-rw-r--r--mert/ForestRescoreTest.cpp102
-rw-r--r--mert/MeteorScorer.cpp3
-rw-r--r--mert/Point.cpp7
-rw-r--r--mert/TODO7
-rw-r--r--mert/TimerTest.cpp15
-rw-r--r--mert/evaluator.cpp24
-rw-r--r--mert/hgtest/0.gzbin0 -> 148523 bytes
-rw-r--r--mert/kbmira.cpp5
-rw-r--r--mert/mert.cpp5
-rw-r--r--mert/pro.cpp9
-rw-r--r--moses-cmd/MainVW.cpp3
-rw-r--r--moses/ExportInterface.cpp5
-rw-r--r--moses/FF/LexicalReordering/SparseReordering.cpp58
-rw-r--r--moses/FF/LexicalReordering/SparseReordering.h6
-rw-r--r--moses/FF/VW/VW.h4
-rw-r--r--moses/HypergraphOutput.cpp1
-rw-r--r--moses/LM/Remote.cpp17
-rw-r--r--moses/LM/Remote.h8
-rw-r--r--moses/Manager.cpp5
-rw-r--r--moses/Parameter.cpp3
-rw-r--r--moses/Syntax/F2S/HyperTreeLoader.cpp24
-rw-r--r--moses/Syntax/F2S/HyperTreeLoader.h10
-rw-r--r--moses/Syntax/F2S/Manager-inl.h24
-rw-r--r--moses/Syntax/F2S/Manager.h3
-rw-r--r--moses/Syntax/RuleTableFF.cpp3
-rw-r--r--moses/Syntax/RuleTableFF.h7
-rw-r--r--moses/TranslationModel/CompactPT/MmapAllocator.h40
-rw-r--r--moses/TranslationModel/DynSAInclude/FileHandler.cpp4
-rw-r--r--moses/TranslationModel/DynSAInclude/hash.h14
-rw-r--r--moses/TranslationModel/DynSAInclude/onlineRLM.h3
-rw-r--r--moses/TranslationModel/DynSAInclude/utils.h16
-rw-r--r--moses/TranslationModel/DynSuffixArray.cpp38
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp26
-rw-r--r--moses/TranslationModel/PhraseDictionaryTransliteration.cpp19
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp9
-rw-r--r--moses/TranslationModel/UG/generic/sampling/Sampling.h17
-rw-r--r--moses/TranslationModel/UG/mm/ug_mmbitext.cc2
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_array_entry.h2
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h12
-rw-r--r--moses/TranslationModel/UG/mmsapt.cpp6
-rw-r--r--moses/TrellisPath.cpp40
-rw-r--r--moses/TrellisPath.h14
-rw-r--r--moses/Util.h6
-rw-r--r--moses/mbr.cpp4
-rw-r--r--moses/server/TranslationRequest.cpp2
-rw-r--r--phrase-extract/DomainFeature.cpp5
-rw-r--r--phrase-extract/DomainFeature.h2
-rw-r--r--phrase-extract/SentenceAlignment.cpp7
-rw-r--r--phrase-extract/SentenceAlignmentWithSyntax.cpp8
-rw-r--r--phrase-extract/consolidate-direct-main.cpp5
-rw-r--r--phrase-extract/consolidate-reverse-main.cpp10
-rw-r--r--phrase-extract/extract-ghkm/XmlTreeParser.cpp3
-rw-r--r--phrase-extract/pcfg-common/xml_tree_parser.cc3
-rw-r--r--phrase-extract/relax-parse-main.cpp5
-rw-r--r--phrase-extract/relax-parse.h2
-rw-r--r--phrase-extract/statistics-main.cpp5
-rw-r--r--phrase-extract/syntax-common/xml_tree_parser.cc3
-rw-r--r--phrase-extract/tables-core.cpp31
-rw-r--r--phrase-extract/tables-core.h2
-rwxr-xr-xscripts/OSM/OSM-Train.perl1
-rwxr-xr-xscripts/OSM/extract-singletons.perl1
-rwxr-xr-xscripts/OSM/flipAlignment.perl4
-rwxr-xr-xscripts/Transliteration/clean.pl1
-rwxr-xr-xscripts/Transliteration/corpusCreator.pl1
-rwxr-xr-xscripts/Transliteration/in-decoding-transliteration.pl1
-rwxr-xr-xscripts/Transliteration/post-decoding-transliteration.pl1
-rwxr-xr-xscripts/Transliteration/prepare-transliteration-phrase-table.pl1
-rwxr-xr-xscripts/Transliteration/threshold.pl1
-rwxr-xr-xscripts/Transliteration/train-transliteration-module.pl1
-rwxr-xr-xscripts/analysis/bootstrap-hypothesis-difference-significance.pl1
-rwxr-xr-xscripts/analysis/sentence-by-sentence.pl1
-rwxr-xr-xscripts/analysis/sg2dot.perl1
-rwxr-xr-xscripts/analysis/show-phrases-used.pl2
-rwxr-xr-xscripts/analysis/smtgui/filter-phrase-table.pl1
-rw-r--r--scripts/ems/experiment.meta42
-rwxr-xr-xscripts/ems/experiment.perl1
-rwxr-xr-xscripts/ems/fix-info.perl1
-rwxr-xr-xscripts/ems/support/analysis.perl1
-rwxr-xr-xscripts/ems/support/build-domain-file-from-subcorpora.perl1
-rwxr-xr-xscripts/ems/support/build-sparse-features.perl1
-rwxr-xr-xscripts/ems/support/consolidate-training-data.perl1
-rwxr-xr-xscripts/ems/support/generic-multicore-parallelizer.perl1
-rwxr-xr-xscripts/ems/support/generic-parallelizer.perl1
-rwxr-xr-xscripts/ems/support/input-from-sgm.perl1
-rwxr-xr-xscripts/ems/support/interpolate-lm.perl1
-rwxr-xr-xscripts/ems/support/lmplz-wrapper.perl8
-rwxr-xr-xscripts/ems/support/mml-filter.perl1
-rwxr-xr-xscripts/ems/support/mml-score.perl1
-rwxr-xr-xscripts/ems/support/mml-train.perl1
-rwxr-xr-xscripts/ems/support/prepare-fast-align.perl3
-rwxr-xr-xscripts/ems/support/reference-from-sgm.perl1
-rwxr-xr-xscripts/ems/support/remove-segmentation-markup.perl1
-rwxr-xr-xscripts/ems/support/report-experiment-scores.perl1
-rwxr-xr-xscripts/ems/support/run-command-on-multiple-refsets.perl1
-rwxr-xr-xscripts/ems/support/run-wade.perl1
-rwxr-xr-xscripts/ems/support/split-sentences.perl1
-rwxr-xr-xscripts/ems/support/submit-grid.perl3
-rwxr-xr-xscripts/ems/support/substitute-filtered-tables-and-weights.perl1
-rwxr-xr-xscripts/ems/support/substitute-filtered-tables.perl2
-rwxr-xr-xscripts/ems/support/substitute-weights.perl2
-rwxr-xr-xscripts/ems/support/symmetrize-fast-align.perl1
-rwxr-xr-xscripts/ems/support/thot-lm-wrapper.perl1
-rwxr-xr-xscripts/ems/support/wrap-xml.perl1
-rw-r--r--scripts/ems/web/analysis.php8
-rw-r--r--scripts/ems/web/base64.js285
-rw-r--r--scripts/ems/web/bilingual-concordance.css1
-rw-r--r--scripts/ems/web/index.php2
-rw-r--r--scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc6
-rw-r--r--scripts/ems/web/overview.php5
-rwxr-xr-xscripts/ems/web/progress.perl1
-rwxr-xr-xscripts/fuzzy-match/create_xml.perl1
-rwxr-xr-xscripts/generic/compound-splitter.perl1
-rwxr-xr-xscripts/generic/extract-factors.pl1
-rwxr-xr-xscripts/generic/extract-parallel.perl5
-rwxr-xr-xscripts/generic/fsa2fsal.pl1
-rwxr-xr-xscripts/generic/fsa2plf.pl1
-rwxr-xr-xscripts/generic/fsal2fsa.pl1
-rwxr-xr-xscripts/generic/generic-parallel.perl1
-rwxr-xr-xscripts/generic/giza-parallel.perl1
-rwxr-xr-xscripts/generic/lopar2pos.pl2
-rwxr-xr-xscripts/generic/moses-parallel.pl1
-rwxr-xr-xscripts/generic/mteval-v12.pl1
-rwxr-xr-xscripts/generic/multi-bleu.perl1
-rwxr-xr-xscripts/generic/ph_numbers.perl1
-rwxr-xr-xscripts/generic/qsub-wrapper.pl1
-rwxr-xr-xscripts/generic/reverse-alignment.perl1
-rwxr-xr-xscripts/generic/score-parallel.perl5
-rwxr-xr-xscripts/generic/strip-xml.perl1
-rwxr-xr-xscripts/generic/trainlm-irst2.perl1
-rwxr-xr-xscripts/generic/trainlm-lmplz.perl40
-rwxr-xr-xscripts/other/beautify.perl1
-rwxr-xr-xscripts/other/delete-scores.perl1
-rwxr-xr-xscripts/other/get_many_translations_from_google.perl1
-rwxr-xr-xscripts/recaser/detruecase.perl1
-rwxr-xr-xscripts/recaser/recase.perl1
-rwxr-xr-xscripts/recaser/train-recaser.perl1
-rwxr-xr-xscripts/recaser/train-truecaser.perl1
-rwxr-xr-xscripts/recaser/truecase.perl2
-rwxr-xr-xscripts/regression-testing/compare-results.pl1
-rwxr-xr-xscripts/regression-testing/create_localized_moses_ini.pl1
-rwxr-xr-xscripts/regression-testing/modify-pars.pl1
-rwxr-xr-xscripts/regression-testing/moses-virtual.pl1
-rwxr-xr-xscripts/regression-testing/run-single-test.pl1
-rwxr-xr-xscripts/regression-testing/run-test-suite.pl1
-rwxr-xr-xscripts/tokenizer/deescape-special-chars-PTB.perl1
-rwxr-xr-xscripts/tokenizer/deescape-special-chars.perl1
-rwxr-xr-xscripts/tokenizer/detokenizer.perl9
-rwxr-xr-xscripts/tokenizer/escape-special-chars.perl1
-rwxr-xr-xscripts/tokenizer/lowercase.perl1
-rwxr-xr-xscripts/tokenizer/normalize-punctuation.perl1
-rwxr-xr-xscripts/tokenizer/pre-tok-clean.perl46
-rwxr-xr-xscripts/tokenizer/pre-tokenizer.perl1
-rwxr-xr-xscripts/tokenizer/remove-non-printing-char.perl1
-rwxr-xr-xscripts/tokenizer/replace-unicode-punctuation.perl1
-rwxr-xr-xscripts/tokenizer/tokenizer.perl1
-rwxr-xr-xscripts/tokenizer/tokenizer_PTB.perl1
-rwxr-xr-xscripts/training/absolutize_moses_model.pl2
-rwxr-xr-xscripts/training/bilingual-lm/extract_training.py2
-rwxr-xr-xscripts/training/binarize-model.perl1
-rwxr-xr-xscripts/training/build-generation-table.perl1
-rwxr-xr-xscripts/training/build-mmsapt.perl1
-rwxr-xr-xscripts/training/clean-corpus-n.perl1
-rwxr-xr-xscripts/training/clone_moses_model.pl1
-rwxr-xr-xscripts/training/convert-moses-ini-to-v2.perl1
-rwxr-xr-xscripts/training/corpus-sizes.perl1
-rwxr-xr-xscripts/training/exodus.perl1
-rwxr-xr-xscripts/training/filter-model-given-input.pl1
-rwxr-xr-xscripts/training/get-lexical.perl1
-rwxr-xr-xscripts/training/giza2bal.pl4
-rwxr-xr-xscripts/training/mert-moses.pl1
-rwxr-xr-xscripts/training/postprocess-lopar.perl1
-rwxr-xr-xscripts/training/reduce-factors.perl1
-rwxr-xr-xscripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl1
-rwxr-xr-xscripts/training/strip-xml.perl17
-rwxr-xr-xscripts/training/threshold-filter.perl1
-rwxr-xr-xscripts/training/train-global-lexicon-model.perl1
-rwxr-xr-xscripts/training/train-model.perl7
-rwxr-xr-xscripts/training/wrappers/berkeleyparsed2mosesxml.perl1
-rwxr-xr-xscripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl1
-rwxr-xr-xscripts/training/wrappers/filter-excluded-lines.perl1
-rwxr-xr-xscripts/training/wrappers/find-unparseable.perl1
-rwxr-xr-xscripts/training/wrappers/mada-wrapper.perl1
-rwxr-xr-xscripts/training/wrappers/madamira-wrapper.perl93
-rwxr-xr-xscripts/training/wrappers/make-factor-brown-cluster-mkcls.perl1
-rwxr-xr-xscripts/training/wrappers/make-factor-de-morph.perl1
-rwxr-xr-xscripts/training/wrappers/make-factor-de-pos.perl1
-rwxr-xr-xscripts/training/wrappers/make-factor-en-pos.mxpost.perl1
-rwxr-xr-xscripts/training/wrappers/make-factor-pos.tree-tagger.perl1
-rwxr-xr-xscripts/training/wrappers/make-factor-stem.perl1
-rwxr-xr-xscripts/training/wrappers/make-factor-suffix.perl1
-rwxr-xr-xscripts/training/wrappers/mosesxml2berkeleyparsed.perl1
-rwxr-xr-xscripts/training/wrappers/parse-de-berkeley.perl1
-rwxr-xr-xscripts/training/wrappers/parse-de-bitpar.perl1
-rwxr-xr-xscripts/training/wrappers/parse-en-collins.perl1
-rwxr-xr-xscripts/training/wrappers/parse-en-egret.perl1
-rwxr-xr-xscripts/training/wrappers/syntax-hyphen-splitting.perl1
-rwxr-xr-xscripts/training/wrappers/tagger-german-chunk.perl1
-rw-r--r--symal/symal.cpp69
-rw-r--r--util/Jamfile4
-rw-r--r--util/mmap.hh5
-rw-r--r--util/random.cc43
-rw-r--r--util/random.hh229
-rw-r--r--util/random_test.cc191
-rw-r--r--util/tempfile.hh151
-rw-r--r--util/tempfile_test.cc119
-rw-r--r--util/tokenize.hh51
-rw-r--r--util/tokenize_test.cc69
-rw-r--r--util/unistd.hh2
222 files changed, 2014 insertions, 538 deletions
diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp
index 3ef82e73a..60ab8db66 100644
--- a/biconcor/phrase-lookup.cpp
+++ b/biconcor/phrase-lookup.cpp
@@ -109,14 +109,17 @@ size_t lookup( string query )
return suffixArray.Count( queryString );
}
-vector<string> tokenize( const char input[] )
+// Duplicate of definition in util/tokenize.hh.
+// TODO: Can we de-duplicate this? At the time of writing biconcor does not
+// use util at all.
+vector<string> tokenize(const char input[])
{
vector< string > token;
bool betweenWords = true;
int start=0;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
+ int i;
+ for(i = 0; input[i] != '\0'; i++) {
+ const bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;
diff --git a/contrib/lmserver/examples/lmclient.cc b/contrib/lmserver/examples/lmclient.cc
index b26984df9..0d9fc23ff 100644
--- a/contrib/lmserver/examples/lmclient.cc
+++ b/contrib/lmserver/examples/lmclient.cc
@@ -45,8 +45,8 @@ struct LMClient {
exit(1);
}
- bzero((char *)&server, sizeof(server));
- bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+ memset(&server, '\0', sizeof(server));
+ memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
server.sin_family = hp->h_addrtype;
server.sin_port = htons(port);
diff --git a/contrib/mada/qsub-madamira.perl b/contrib/mada/qsub-madamira.perl
new file mode 100755
index 000000000..bb7ecd06b
--- /dev/null
+++ b/contrib/mada/qsub-madamira.perl
@@ -0,0 +1,46 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+use File::Slurp;
+use File::Basename;
+use Cwd 'abs_path';
+
+my $splitDir = $ARGV[0];
+$splitDir = abs_path($splitDir);
+
+my @files = read_dir $splitDir;
+
+my $qsubDir=dirname($splitDir) ."/qsub";
+print STDERR "qsubDir=$qsubDir\n";
+`mkdir -p $qsubDir`;
+
+my $out2Dir=dirname($splitDir) ."/out2";
+print STDERR "out2Dir=$out2Dir\n";
+`mkdir -p $out2Dir`;
+
+for my $file ( @files ) {
+ print STDERR "$file ";
+
+ my $qsubFile = "$qsubDir/$file.sh";
+ open(RUN_FILE, ">$qsubFile");
+
+ print RUN_FILE "#!/usr/bin/env bash\n"
+ ."#PBS -d/scratch/hh65/workspace/experiment/ar-en \n"
+ ."#PBS -l mem=5gb \n\n"
+ ."export PATH=\"/scratch/statmt/bin:/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1/bin:/share/apps/NYUAD/jdk/1.8.0_31/bin:/share/apps/NYUAD/zlib/gcc_4.9.1/1.2.8/bin:/share/apps/NYUAD/cmake/gcc_4.9.1/3.1.0-rc3/bin:/share/apps/NYUAD/boost/gcc_4.9.1/openmpi_1.8.3/1.57.0/bin:/share/apps/NYUAD/openmpi/gcc_4.9.1/1.8.3/bin:/share/apps/NYUAD/python/gcc_4.9.1/2.7.9/bin:/share/apps/NYUAD/gcc/binutils/2.21/el6/bin:/share/apps/NYUAD/gcc/gcc/4.9.1/el6/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/opt/bio/ncbi/bin:/opt/bio/mpiblast/bin:/opt/bio/EMBOSS/bin:/opt/bio/clustalw/bin:/opt/bio/tcoffee/bin:/opt/bio/hmmer/bin:/opt/bio/phylip/exe:/opt/bio/mrbayes:/opt/bio/fasta:/opt/bio/glimmer/bin:/opt/bio/glimmer/scripts:/opt/bio/gromacs/bin:/opt/bio/gmap/bin:/opt/bio/tigr/bin:/opt/bio/autodocksuite/bin:/opt/bio/wgs/bin:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/bin:/usr/java/latest/bin:/opt/pdsh/bin:/opt/rocks/bin:/opt/rocks/sbin:/opt/torque/bin:/opt/torque/sbin:/home/hh65/bin:/home/hh65/bin\" \n"
+
+ ."module load NYUAD/2.0 \n"
+ ."module load gcc python/2.7.9 openmpi/1.8.3 boost cmake zlib jdk perl expat \n"
+
+ ."cd /scratch/statmt/MADAMIRA-release-20140709-1.0 \n";
+ print RUN_FILE "java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar /scratch/statmt/MADAMIRA-release-20140709-1.0/MADAMIRA.jar "
+ ."-rawinput $splitDir/$file -rawoutdir $out2Dir -rawconfig /scratch/statmt/MADAMIRA-release-20140709-1.0/samples/sampleConfigFile.xml \n";
+
+ close(RUN_FILE);
+
+ my $cmd = "qsub $qsubFile";
+ `$cmd`;
+
+}
+
diff --git a/contrib/mira/Main.cpp b/contrib/mira/Main.cpp
index abf92b598..acc2f8886 100644
--- a/contrib/mira/Main.cpp
+++ b/contrib/mira/Main.cpp
@@ -46,6 +46,7 @@ namespace mpi = boost::mpi;
#include "moses/FF/PhrasePairFeature.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "moses/LM/Base.h"
+#include "util/random.hh"
using namespace Mira;
using namespace std;
@@ -54,6 +55,7 @@ namespace po = boost::program_options;
int main(int argc, char** argv)
{
+ util::rand_init();
size_t rank = 0;
size_t size = 1;
#ifdef MPI_ENABLE
diff --git a/contrib/mira/Main.h b/contrib/mira/Main.h
index 8736257f6..b8faedae7 100644
--- a/contrib/mira/Main.h
+++ b/contrib/mira/Main.h
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/Word.h"
#include "moses/FF/FeatureFunction.h"
#include "Decoder.h"
+#include "util/random.hh"
typedef std::map<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightMap;
typedef std::pair<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightPair;
@@ -37,8 +38,7 @@ template <class T> bool from_string(T& t, const std::string& s, std::ios_base& (
struct RandomIndex {
ptrdiff_t operator()(ptrdiff_t max) {
- srand(time(0)); // Initialize random number generator with current time.
- return static_cast<ptrdiff_t> (rand() % max);
+ return util::rand_excl(max);
}
};
diff --git a/contrib/other-builds/manual-label/manual-label.project b/contrib/other-builds/manual-label/manual-label.project
index 2bc69a6ca..3e3efcddb 100644
--- a/contrib/other-builds/manual-label/manual-label.project
+++ b/contrib/other-builds/manual-label/manual-label.project
@@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="manual-label" InternalType="Console">
+ <Plugins>
+ <Plugin Name="CMakePlugin">
+ <![CDATA[[{
+ "name": "Debug",
+ "enabled": false,
+ "buildDirectory": "build",
+ "sourceDirectory": "$(ProjectPath)",
+ "generator": "",
+ "buildType": "",
+ "arguments": [],
+ "parentProject": ""
+ }]]]>
+ </Plugin>
+ <Plugin Name="qmake">
+ <![CDATA[00010001N0005Debug000000000000]]>
+ </Plugin>
+ </Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="manual-label">
@@ -14,6 +31,8 @@
<File Name="Main.cpp"/>
<File Name="Main.h"/>
</VirtualDirectory>
+ <Dependencies Name="Debug"/>
+ <Dependencies Name="Release"/>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@@ -33,6 +52,8 @@
<Linker Options="" Required="yes">
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
<Library Value="boost_program_options"/>
+ <Library Value="boost_filesystem"/>
+ <Library Value="boost_system"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
@@ -107,6 +128,4 @@
</Completion>
</Configuration>
</Settings>
- <Dependencies Name="Debug"/>
- <Dependencies Name="Release"/>
</CodeLite_Project>
diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project
index b978b451e..ecef4038b 100644
--- a/contrib/other-builds/moses-cmd/moses-cmd.project
+++ b/contrib/other-builds/moses-cmd/moses-cmd.project
@@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="moses-cmd" InternalType="Console">
+ <Plugins>
+ <Plugin Name="CMakePlugin">
+ <![CDATA[[{
+ "name": "Debug",
+ "enabled": false,
+ "buildDirectory": "build",
+ "sourceDirectory": "$(ProjectPath)",
+ "generator": "",
+ "buildType": "",
+ "arguments": [],
+ "parentProject": ""
+ }]]]>
+ </Plugin>
+ <Plugin Name="qmake">
+ <![CDATA[00010001N0005Debug000000000000]]>
+ </Plugin>
+ </Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src"/>
@@ -9,6 +26,14 @@
<File Name="../../../moses-cmd/MainVW.cpp" ExcludeProjConfig="Debug"/>
<File Name="../../../moses-cmd/MainVW.h" ExcludeProjConfig="Debug"/>
</VirtualDirectory>
+ <Dependencies Name="Release"/>
+ <Dependencies Name="Debug">
+ <Project Name="OnDiskPt"/>
+ <Project Name="lm"/>
+ <Project Name="moses"/>
+ <Project Name="search"/>
+ <Project Name="util"/>
+ </Dependencies>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@@ -53,7 +78,7 @@
<Library Value="rt"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
- <General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
+ <General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="-f /var/folders/c4/2p48fcwx611dmkdqq44mbblm0000gn/T/ZVd8xvuJAR.ini -i /Users/hieu/workspace/github/moses-regression-tests/tests/phrase.basic-surface-binptable.oldformat/to-translate.txt" UseSeparateDebugArgs="yes" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
<![CDATA[]]>
</Environment>
@@ -125,12 +150,4 @@
</Completion>
</Configuration>
</Settings>
- <Dependencies Name="Release"/>
- <Dependencies Name="Debug">
- <Project Name="OnDiskPt"/>
- <Project Name="lm"/>
- <Project Name="moses"/>
- <Project Name="search"/>
- <Project Name="util"/>
- </Dependencies>
</CodeLite_Project>
diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project
index 7d666558f..55bf4e8f1 100644
--- a/contrib/other-builds/moses/moses.project
+++ b/contrib/other-builds/moses/moses.project
@@ -474,8 +474,6 @@
<File Name="../../../moses/FF/DistortionScoreProducer.h"/>
<File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.cpp"/>
<File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.h"/>
- <File Name="../../../moses/FF/ExternalFeature.cpp"/>
- <File Name="../../../moses/FF/ExternalFeature.h"/>
<File Name="../../../moses/FF/Factory.cpp"/>
<File Name="../../../moses/FF/Factory.h"/>
<File Name="../../../moses/FF/FeatureFunction.cpp"/>
diff --git a/contrib/relent-filter/src/Main.cpp b/contrib/relent-filter/src/Main.cpp
index 1f86e2cc7..3c7911248 100755
--- a/contrib/relent-filter/src/Main.cpp
+++ b/contrib/relent-filter/src/Main.cpp
@@ -42,6 +42,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "RelativeEntropyCalc.h"
#include "LexicalReordering.h"
#include "LexicalReorderingState.h"
+#include "util/random.hh"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
@@ -205,7 +206,7 @@ int main(int argc, char** argv)
//initialise random numbers
- srand(time(NULL));
+ rand_init();
// set up read/writing class
IOWrapper* ioWrapper = GetIOWrapper(staticData);
diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index befebd8d2..edf7daa13 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -536,7 +536,7 @@ public:
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
- path.GetScoreBreakdown().OutputAllFeatureScores(buf);
+ path.GetScoreBreakdown()->OutputAllFeatureScores(buf);
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 49c1239e5..98f6c8399 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -17,6 +17,7 @@
#include "util/exception.hh"
#include "util/file_piece.hh"
+#include "util/random.hh"
#include "util/tokenize_piece.hh"
#include "util/string_piece.hh"
#include "FeatureDataIterator.h"
@@ -286,7 +287,7 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
} else {
//create shards by randomly sampling
for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
- shard_contents.push_back(rand() % data_size);
+ shard_contents.push_back(util::rand_excl(data_size));
}
}
diff --git a/mert/Fdstream.h b/mert/Fdstream.h
index 2258ef4a5..23eecc466 100644
--- a/mert/Fdstream.h
+++ b/mert/Fdstream.h
@@ -13,6 +13,8 @@
#include <iostream>
#include <string>
+#include "util/unistd.hh"
+
#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
#include <ext/stdio_filebuf.h>
diff --git a/mert/FileStream.cpp b/mert/FileStream.cpp
index 800ce1bfe..3d908de4f 100644
--- a/mert/FileStream.cpp
+++ b/mert/FileStream.cpp
@@ -40,28 +40,3 @@ inputfilestream::~inputfilestream()
void inputfilestream::close()
{
}
-
-outputfilestream::outputfilestream(const std::string &filePath)
- : std::ostream(0), m_streambuf(0), m_is_good(false)
-{
- // check if file is readable
- std::filebuf* fb = new std::filebuf();
- m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
-
- if (IsGzipFile(filePath)) {
- throw runtime_error("Output to a zipped file not supported!");
- } else {
- m_streambuf = fb;
- }
- this->init(m_streambuf);
-}
-
-outputfilestream::~outputfilestream()
-{
- delete m_streambuf;
- m_streambuf = 0;
-}
-
-void outputfilestream::close()
-{
-}
diff --git a/mert/FileStream.h b/mert/FileStream.h
index 582cbcb59..8cbf4f591 100644
--- a/mert/FileStream.h
+++ b/mert/FileStream.h
@@ -22,20 +22,4 @@ public:
void close();
};
-class outputfilestream : public std::ostream
-{
-protected:
- std::streambuf *m_streambuf;
- bool m_is_good;
-
-public:
- explicit outputfilestream(const std::string &filePath);
- virtual ~outputfilestream();
-
- bool good() const {
- return m_is_good;
- }
- void close();
-};
-
#endif // MERT_FILE_STREAM_H_
diff --git a/mert/ForestRescoreTest.cpp b/mert/ForestRescoreTest.cpp
index 4b62e8317..23668ab20 100644
--- a/mert/ForestRescoreTest.cpp
+++ b/mert/ForestRescoreTest.cpp
@@ -1,6 +1,9 @@
#include <iostream>
+#include "util/tokenize_piece.hh"
+
#include "ForestRescore.h"
+#include "MiraFeatureVector.h"
#define BOOST_TEST_MODULE MertForestRescore
#include <boost/test/unit_test.hpp>
@@ -10,8 +13,7 @@
using namespace std;
using namespace MosesTuning;
-BOOST_AUTO_TEST_CASE(viterbi_simple_lattice)
-{
+BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) {
Vocab vocab;
WordVec words;
string wordStrings[] =
@@ -242,5 +244,101 @@ BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice)
BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[8]);
}
+BOOST_AUTO_TEST_CASE(viterbi_full_hypergraph) {
+ Vocab vocab;
+ //References
+ ReferenceSet references;
+ references.AddLine(0,"in addition to EU support for businesses , also the administration of national business support will be concentrated in four Centres for Economic Development , Transport and Environment ( ELY Centres ) , starting from mid @-@ September .",vocab);
+ //Load the hypergraph
+ Graph graph(vocab);
+ util::scoped_fd fd(util::OpenReadOrThrow("mert/hgtest/0.gz"));
+ util::FilePiece file(fd.release());
+ ReadGraph(file,graph);
+
+ //prune
+ SparseVector weights;
+ weights.set("OpSequenceModel0_1",0.011187);
+ weights.set("OpSequenceModel0_2",-0.002797);
+ weights.set("OpSequenceModel0_3",0.002797);
+ weights.set("OpSequenceModel0_4",-0.000140);
+ weights.set("OpSequenceModel0_5",0.004195);
+ weights.set("Distortion0",0.041952);
+ weights.set("PhrasePenalty0",0.027968);
+ weights.set("WordPenalty0",-0.139841);
+ weights.set("UnknownWordPenalty0",1.000000);
+ weights.set("LM0",0.069920);
+ weights.set("LexicalReordering0_1",0.041952);
+ weights.set("LexicalReordering0_2",0.041952);
+ weights.set("LexicalReordering0_3",0.041952);
+ weights.set("LexicalReordering0_4",0.041952);
+ weights.set("LexicalReordering0_5",0.041952);
+ weights.set("LexicalReordering0_6",0.041952);
+ weights.set("LexicalReordering0_7",0.041952);
+ weights.set("LexicalReordering0_8",0.041952);
+ weights.set("TranslationModel0_1",0.027968);
+ weights.set("TranslationModel0_2",0.027968);
+ weights.set("TranslationModel0_3",0.027968);
+ weights.set("TranslationModel0_4",0.027968);
+ weights.set("TranslationModel0_5",0.027968);
+ weights.set("TranslationModel0_6",0.027968);
+ weights.set("TranslationModel0_7",0.027968);
+ weights.set("TranslationModel0_8",0.027968);
+ weights.set("TranslationModel0_9",0.027968);
+ weights.set("TranslationModel0_10",0.027968);
+ weights.set("TranslationModel0_11",0.027968);
+ weights.set("TranslationModel0_12",0.027968);
+ weights.set("TranslationModel0_13",0.027968);
+ size_t edgeCount = 500;
+ boost::shared_ptr<Graph> prunedGraph;
+ prunedGraph.reset(new Graph(vocab));
+ graph.Prune(prunedGraph.get(), weights, edgeCount);
+
+ vector<ValType> bg(9);
+ HgHypothesis bestHypo;
+ //best hypothesis
+ Viterbi(*prunedGraph, weights, 0, references, 0, bg, &bestHypo);
+ //check output as expected
+ string expectedStr = "<s> the EU matters , but also the national matters management focus since mid @-@ September four ely @-@ centre . </s>";
+ util::TokenIter<util::SingleCharacter, true> expected(expectedStr, util::SingleCharacter(' '));
+ for (size_t i = 0; i < bestHypo.text.size(); ++i) {
+ //cerr << bestHypo.text[i]->first << " ";
+ BOOST_CHECK_EQUAL(*expected,bestHypo.text[i]->first);
+ ++expected;
+ }
+ BOOST_CHECK(!expected);
+ //cerr << endl;
+ //check scores
+ BOOST_CHECK_CLOSE(-80.062,bestHypo.featureVector.get("OpSequenceModel0_1"), 0.001);
+ BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_2"), 0.001);
+ BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_3"), 0.001);
+ BOOST_CHECK_CLOSE(3,bestHypo.featureVector.get("OpSequenceModel0_4"), 0.001);
+ BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("OpSequenceModel0_5"), 0.001);
+ BOOST_CHECK_CLOSE(-6,bestHypo.featureVector.get("Distortion0"), 0.001);
+ BOOST_CHECK_CLOSE(14,bestHypo.featureVector.get("PhrasePenalty0"), 0.001);
+ BOOST_CHECK_CLOSE(-20,bestHypo.featureVector.get("WordPenalty0"), 0.001);
+ BOOST_CHECK_CLOSE(-100,bestHypo.featureVector.get("UnknownWordPenalty0"), 0.001);
+ BOOST_CHECK_CLOSE(-126.616,bestHypo.featureVector.get("LM0"), 0.001);
+ BOOST_CHECK_CLOSE(-5.2238,bestHypo.featureVector.get("LexicalReordering0_1"), 0.001);
+ BOOST_CHECK_CLOSE(-0.29515,bestHypo.featureVector.get("LexicalReordering0_2"), 0.001);
+ BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_3"), 0.001);
+ BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_4"), 0.001);
+ BOOST_CHECK_CLOSE(-9.28267,bestHypo.featureVector.get("LexicalReordering0_5"), 0.001);
+ BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_6"), 0.001);
+ BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_7"), 0.001);
+ BOOST_CHECK_CLOSE(-0.402678,bestHypo.featureVector.get("LexicalReordering0_8"), 0.001);
+ BOOST_CHECK_CLOSE(-54.3119,bestHypo.featureVector.get("TranslationModel0_1"), 0.001);
+ BOOST_CHECK_CLOSE(-62.2619,bestHypo.featureVector.get("TranslationModel0_2"), 0.001);
+ BOOST_CHECK_CLOSE(-23.8782,bestHypo.featureVector.get("TranslationModel0_3"), 0.001);
+ BOOST_CHECK_CLOSE(-25.1626,bestHypo.featureVector.get("TranslationModel0_4"), 0.001);
+ BOOST_CHECK_CLOSE(12.9986,bestHypo.featureVector.get("TranslationModel0_5"), 0.001);
+ BOOST_CHECK_CLOSE(3.99959,bestHypo.featureVector.get("TranslationModel0_6"), 0.001);
+ BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_7"), 0.001);
+ BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_8"), 0.001);
+ BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_9"), 0.001);
+ BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_10"), 0.001);
+ BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_11"), 0.001);
+ BOOST_CHECK_CLOSE(0.999896,bestHypo.featureVector.get("TranslationModel0_12"), 0.001);
+ BOOST_CHECK_CLOSE(7.99917,bestHypo.featureVector.get("TranslationModel0_13"), 0.001);
+}
diff --git a/mert/MeteorScorer.cpp b/mert/MeteorScorer.cpp
index 1254ec95f..f4c7997ee 100644
--- a/mert/MeteorScorer.cpp
+++ b/mert/MeteorScorer.cpp
@@ -18,6 +18,7 @@
#include "ScoreStats.h"
#include "Util.h"
+#include "util/unistd.hh"
using namespace std;
@@ -25,7 +26,7 @@ namespace MosesTuning
{
// Meteor supported
-#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
+#if (defined(__GLIBCXX__) || defined(__GLIBCPP__)) && !defined(_WIN32)
// for clarity
#define CHILD_STDIN_READ pipefds_input[0]
diff --git a/mert/Point.cpp b/mert/Point.cpp
index 55dc6a6b2..681d3ab3e 100644
--- a/mert/Point.cpp
+++ b/mert/Point.cpp
@@ -3,6 +3,7 @@
#include <cmath>
#include <cstdlib>
#include "util/exception.hh"
+#include "util/random.hh"
#include "FeatureStats.h"
#include "Optimizer.h"
@@ -57,10 +58,8 @@ void Point::Randomize()
UTIL_THROW_IF(m_min.size() != Point::m_dim, util::Exception, "Error");
UTIL_THROW_IF(m_max.size() != Point::m_dim, util::Exception, "Error");
- for (unsigned int i = 0; i < size(); i++) {
- operator[](i) = m_min[i] +
- static_cast<float>(random()) / static_cast<float>(RAND_MAX) * (m_max[i] - m_min[i]);
- }
+ for (unsigned int i = 0; i < size(); i++)
+ operator[](i) = util::rand_incl(m_min[i], m_max[i]);
}
double Point::operator*(const FeatureStats& F) const
diff --git a/mert/TODO b/mert/TODO
index 21b4ce04e..4ceb628d3 100644
--- a/mert/TODO
+++ b/mert/TODO
@@ -5,11 +5,8 @@
- check that --pairwise-ranked is compatible with all optimization metrics
-- Replace the standard rand() currently used in MERT and PRO with better
- random generators such as Boost's random generators (e.g., boost::mt19937).
- - create a Random class to hide the details, i.e., how to generate
- random numbers, which allows us to use custom random generators more
- easily.
+- Use better random generators in util/random.cc, e.g. boost::mt19937.
+ - Support plugging of custom random generators.
Pros:
- In MERT, you might want to use the random restarting technique to avoid
diff --git a/mert/TimerTest.cpp b/mert/TimerTest.cpp
index d72b1c312..532e44fc1 100644
--- a/mert/TimerTest.cpp
+++ b/mert/TimerTest.cpp
@@ -11,7 +11,20 @@ using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(timer_basic_test)
{
Timer timer;
- const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests.
+
+ // Sleep time. The test will sleep for this number of microseconds, and
+ // expect the elapsed time to be noticeable.
+ // Keep this number low to avoid wasting test time sleeping, but at least as
+ // high as the Boost timer's resolution. Tests must pass consistently, not
+ // just on lucky runs.
+#if defined(WIN32)
+ // Timer resolution on Windows seems to be a millisecond. Anything less and
+ // the test fails consistently.
+ const int sleep_time_microsec = 1000;
+#else
+ // Unix-like systems seem to have more fine-grained clocks.
+ const int sleep_time_microsec = 40;
+#endif
timer.start();
BOOST_REQUIRE(timer.is_running());
diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp
index 25da9adbc..59ffaf3cd 100644
--- a/mert/evaluator.cpp
+++ b/mert/evaluator.cpp
@@ -1,3 +1,4 @@
+#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
@@ -15,6 +16,7 @@
#include "Timer.h"
#include "Util.h"
#include "Data.h"
+#include "util/random.hh"
using namespace std;
using namespace MosesTuning;
@@ -91,17 +93,15 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i
if (bootstrap) {
vector<float> scores;
for (int i = 0; i < bootstrap; ++i) {
- // TODO: Use smart pointer for exceptional-safety.
- ScoreData* scoredata = new ScoreData(g_scorer);
+ ScoreData scoredata(g_scorer);
for (int j = 0; j < n; ++j) {
- int randomIndex = random() % n;
- scoredata->add(entries[randomIndex], j);
+ const int randomIndex = util::rand_excl(n);
+ scoredata.add(entries[randomIndex], j);
}
- g_scorer->setScoreData(scoredata);
+ g_scorer->setScoreData(&scoredata);
candidates_t candidates(n, 0);
float score = g_scorer->score(candidates);
scores.push_back(score);
- delete scoredata;
}
float avg = average(scores);
@@ -121,15 +121,13 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i
cout.precision(4);
cout << avg << "\t[" << lb << "," << rb << "]" << endl;
} else {
- // TODO: Use smart pointer for exceptional-safety.
- ScoreData* scoredata = new ScoreData(g_scorer);
+ ScoreData scoredata(g_scorer);
for (int sid = 0; sid < n; ++sid) {
- scoredata->add(entries[sid], sid);
+ scoredata.add(entries[sid], sid);
}
- g_scorer->setScoreData(scoredata);
+ g_scorer->setScoreData(&scoredata);
candidates_t candidates(n, 0);
float score = g_scorer->score(candidates);
- delete scoredata;
if (g_has_more_files) cout << candFile << "\t";
if (g_has_more_scorers) cout << g_scorer->getName() << "\t";
@@ -287,10 +285,10 @@ void InitSeed(const ProgramOption *opt)
{
if (opt->has_seed) {
cerr << "Seeding random numbers with " << opt->seed << endl;
- srandom(opt->seed);
+ util::rand_init(opt->seed);
} else {
cerr << "Seeding random numbers with system clock " << endl;
- srandom(time(NULL));
+ util::rand_init();
}
}
diff --git a/mert/hgtest/0.gz b/mert/hgtest/0.gz
new file mode 100644
index 000000000..012f9efbe
--- /dev/null
+++ b/mert/hgtest/0.gz
Binary files differ
diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp
index 5a119e875..092176984 100644
--- a/mert/kbmira.cpp
+++ b/mert/kbmira.cpp
@@ -40,6 +40,7 @@ de recherches du Canada
#include <boost/scoped_ptr.hpp>
#include "util/exception.hh"
+#include "util/random.hh"
#include "BleuScorer.h"
#include "HopeFearDecoder.h"
@@ -122,10 +123,10 @@ int main(int argc, char** argv)
if (vm.count("random-seed")) {
cerr << "Initialising random seed to " << seed << endl;
- srand(seed);
+ util::rand_init(seed);
} else {
cerr << "Initialising random seed from system clock" << endl;
- srand(time(NULL));
+ util::rand_init();
}
// Initialize weights
diff --git a/mert/mert.cpp b/mert/mert.cpp
index 275aa7b09..82b4cc34d 100644
--- a/mert/mert.cpp
+++ b/mert/mert.cpp
@@ -24,6 +24,7 @@
#include "Types.h"
#include "Timer.h"
#include "Util.h"
+#include "util/random.hh"
#include "moses/ThreadPool.h"
@@ -289,10 +290,10 @@ int main(int argc, char **argv)
if (option.has_seed) {
cerr << "Seeding random numbers with " << option.seed << endl;
- srandom(option.seed);
+ util::rand_init(option.seed);
} else {
cerr << "Seeding random numbers with system clock " << endl;
- srandom(time(NULL));
+ util::rand_init();
}
if (option.sparse_weights_file.size()) ++option.pdim;
diff --git a/mert/pro.cpp b/mert/pro.cpp
index 7660fe7d0..c0f9f7b57 100644
--- a/mert/pro.cpp
+++ b/mert/pro.cpp
@@ -43,6 +43,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ScoreDataIterator.h"
#include "BleuScorer.h"
#include "Util.h"
+#include "util/random.hh"
using namespace std;
using namespace MosesTuning;
@@ -141,10 +142,10 @@ int main(int argc, char** argv)
if (vm.count("random-seed")) {
cerr << "Initialising random seed to " << seed << endl;
- srand(seed);
+ util::rand_init(seed);
} else {
cerr << "Initialising random seed from system clock" << endl;
- srand(time(NULL));
+ util::rand_init();
}
if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
@@ -211,11 +212,11 @@ int main(int argc, char** argv)
vector<float> scores;
size_t n_translations = hypotheses.size();
for(size_t i=0; i<n_candidates; i++) {
- size_t rand1 = rand() % n_translations;
+ size_t rand1 = util::rand_excl(n_translations);
pair<size_t,size_t> translation1 = hypotheses[rand1];
float bleu1 = smoothedSentenceBleu(scoreDataIters[translation1.first]->operator[](translation1.second), bleuSmoothing, smoothBP);
- size_t rand2 = rand() % n_translations;
+ size_t rand2 = util::rand_excl(n_translations);
pair<size_t,size_t> translation2 = hypotheses[rand2];
float bleu2 = smoothedSentenceBleu(scoreDataIters[translation2.first]->operator[](translation2.second), bleuSmoothing, smoothBP);
diff --git a/moses-cmd/MainVW.cpp b/moses-cmd/MainVW.cpp
index c55b0fe2c..302866733 100644
--- a/moses-cmd/MainVW.cpp
+++ b/moses-cmd/MainVW.cpp
@@ -45,6 +45,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/StatelessFeatureFunction.h"
#include "moses/TrainingTask.h"
+#include "util/random.hh"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
@@ -117,7 +118,7 @@ int main(int argc, char** argv)
//initialise random numbers
- srand(time(NULL));
+ util::rand_init();
// set up read/writing class
IFVERBOSE(1) {
diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp
index b6a5ec255..3a64ac8ac 100644
--- a/moses/ExportInterface.cpp
+++ b/moses/ExportInterface.cpp
@@ -27,6 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <sstream>
#include <vector>
+#include "util/random.hh"
#include "util/usage.hh"
#ifdef WIN32
@@ -91,7 +92,7 @@ SimpleTranslationInterface::SimpleTranslationInterface(const string &mosesIni):
exit(1);
}
- srand(time(NULL));
+ util::rand_init();
}
@@ -185,7 +186,7 @@ batch_run()
const StaticData& staticData = StaticData::Instance();
//initialise random numbers
- srand(time(NULL));
+ util::rand_init();
IFVERBOSE(1) PrintUserTime("Created input-output object");
diff --git a/moses/FF/LexicalReordering/SparseReordering.cpp b/moses/FF/LexicalReordering/SparseReordering.cpp
index 040b94988..27e090ccd 100644
--- a/moses/FF/LexicalReordering/SparseReordering.cpp
+++ b/moses/FF/LexicalReordering/SparseReordering.cpp
@@ -13,8 +13,11 @@
#include "LexicalReordering.h"
#include "SparseReordering.h"
+#include <boost/algorithm/string/predicate.hpp>
+
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -57,6 +60,7 @@ const std::string& SparseReorderingFeatureKey::Name (const string& wordListId)
SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
: m_producer(producer)
+ , m_useWeightMap(false)
{
static const string kSource= "source";
static const string kTarget = "target";
@@ -80,6 +84,14 @@ SparseReordering::SparseReordering(const map<string,string>& config, const Lexic
} else {
UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
}
+ } else if (fields[0] == "weights") {
+ ReadWeightMap(i->second);
+ m_useWeightMap = true;
+ for (int reoType=0; reoType<=LRModel::MAX; ++reoType) {
+ ostringstream buf;
+ buf << reoType;
+ m_featureMap2.push_back(m_producer->GetFeatureName(buf.str()));
+ }
} else if (fields[0] == "phrase") {
m_usePhrase = true;
@@ -175,7 +187,16 @@ void SparseReordering::AddFeatures(
SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType);
FeatureMap::const_iterator fmi = m_featureMap.find(key);
assert(fmi != m_featureMap.end());
- scores->SparsePlusEquals(fmi->second, 1.0);
+ if (m_useWeightMap) {
+ WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
+ if (wmi != m_weightMap.end()) {
+ if (wmi->second != 0) {
+ scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
+ }
+ }
+ } else {
+ scores->SparsePlusEquals(fmi->second, 1.0);
+ }
}
for (size_t id = 0; id < clusterMaps->size(); ++id) {
@@ -186,7 +207,16 @@ void SparseReordering::AddFeatures(
SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType);
FeatureMap::const_iterator fmi = m_featureMap.find(key);
assert(fmi != m_featureMap.end());
- scores->SparsePlusEquals(fmi->second, 1.0);
+ if (m_useWeightMap) {
+ WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
+ if (wmi != m_weightMap.end()) {
+ if (wmi->second != 0) {
+ scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
+ }
+ }
+ } else {
+ scores->SparsePlusEquals(fmi->second, 1.0);
+ }
}
}
@@ -256,5 +286,29 @@ void SparseReordering::CopyScores(
}
+
+void SparseReordering::ReadWeightMap(const string& filename)
+{
+ util::FilePiece file(filename.c_str());
+ StringPiece line;
+ while (true) {
+ try {
+ line = file.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+ util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter(' '));
+ UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
+ const std::string& name = lineIter->as_string();
+ ++lineIter;
+ UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
+ float weight = Moses::Scan<float>(lineIter->as_string());
+
+ std::pair< WeightMap::iterator, bool> inserted = m_weightMap.insert( std::make_pair(name, weight) );
+ UTIL_THROW_IF2(!inserted.second, "Duplicate weight: '" << name << "'");
+ }
+}
+
+
} //namespace
diff --git a/moses/FF/LexicalReordering/SparseReordering.h b/moses/FF/LexicalReordering/SparseReordering.h
index 8a2495ce8..958ce998b 100644
--- a/moses/FF/LexicalReordering/SparseReordering.h
+++ b/moses/FF/LexicalReordering/SparseReordering.h
@@ -112,10 +112,16 @@ private:
typedef boost::unordered_map<SparseReorderingFeatureKey, FName, HashSparseReorderingFeatureKey, EqualsSparseReorderingFeatureKey> FeatureMap;
FeatureMap m_featureMap;
+ typedef boost::unordered_map<std::string, float> WeightMap;
+ WeightMap m_weightMap;
+ bool m_useWeightMap;
+ std::vector<FName> m_featureMap2;
+
void ReadWordList(const std::string& filename, const std::string& id,
SparseReorderingFeatureKey::Side side, std::vector<WordList>* pWordLists);
void ReadClusterMap(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector<ClusterMap>* pClusterMaps);
void PreCalculateFeatureNames(size_t index, const std::string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster);
+ void ReadWeightMap(const std::string& filename);
void AddFeatures(
SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
diff --git a/moses/FF/VW/VW.h b/moses/FF/VW/VW.h
index 6bdb1416c..dd9d0b858 100644
--- a/moses/FF/VW/VW.h
+++ b/moses/FF/VW/VW.h
@@ -86,6 +86,10 @@ struct VWTargetSentence {
int src = it->first;
int tgt = it->second;
+ if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) {
+ UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt);
+ }
+
m_sourceConstraints[src].Update(tgt);
m_targetConstraints[tgt].Update(src);
}
diff --git a/moses/HypergraphOutput.cpp b/moses/HypergraphOutput.cpp
index 47c564882..6b353a83b 100644
--- a/moses/HypergraphOutput.cpp
+++ b/moses/HypergraphOutput.cpp
@@ -98,6 +98,7 @@ HypergraphOutput<M>::HypergraphOutput(size_t precision) :
// If this line gives you compile errors,
// contact Lane Schwartz on the Moses mailing list
m_hypergraphDir = nbestPath.parent_path().string();
+ if (m_hypergraphDir.empty()) m_hypergraphDir=".";
} else {
stringstream hypergraphDirName;
diff --git a/moses/LM/Remote.cpp b/moses/LM/Remote.cpp
index af02a6617..33946442a 100644
--- a/moses/LM/Remote.cpp
+++ b/moses/LM/Remote.cpp
@@ -1,14 +1,15 @@
#include <cstdio>
#include <cstdlib>
+#include <cstring>
#include <unistd.h>
#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <netdb.h>
#include "Remote.h"
#include "moses/Factor.h"
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <arpa/inet.h>
+#endif
+
namespace Moses
{
@@ -41,12 +42,16 @@ bool LanguageModelRemote::start(const std::string& host, int port)
sock = socket(AF_INET, SOCK_STREAM, 0);
hp = gethostbyname(host.c_str());
if (hp==NULL) {
+#if defined(_WIN32) || defined(_WIN64)
+ fprintf(stderr, "gethostbyname failed\n");
+#else
herror("gethostbyname failed");
+#endif
exit(1);
}
- bzero((char *)&server, sizeof(server));
- bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+ memset(&server, '\0', sizeof(server));
+ memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
server.sin_family = hp->h_addrtype;
server.sin_port = htons(port);
diff --git a/moses/LM/Remote.h b/moses/LM/Remote.h
index d50e3e9b4..b7a72d853 100644
--- a/moses/LM/Remote.h
+++ b/moses/LM/Remote.h
@@ -4,9 +4,15 @@
#include "SingleFactor.h"
#include "moses/TypeDef.h"
#include "moses/Factor.h"
-#include <sys/socket.h>
#include <sys/types.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <winsock2.h>
+#else
+#include <sys/socket.h>
#include <netinet/in.h>
+#include <netdb.h>
+#endif
namespace Moses
{
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index cb91a9d29..a936fa7c7 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -55,6 +55,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#endif
#include "util/exception.hh"
+#include "util/random.hh"
using namespace std;
@@ -426,7 +427,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
//cerr << endl;
//draw the sample
- float frandom = log((float)rand()/RAND_MAX);
+ const float frandom = log(util::rand_incl(0.0f, 1.0f));
size_t position = 1;
float sum = candidateScores[0];
for (; position < candidateScores.size() && sum < frandom; ++position) {
@@ -1645,7 +1646,7 @@ void Manager::OutputNBest(std::ostream& out
out << " |||";
// print scores with feature names
- path.GetScoreBreakdown().OutputAllFeatureScores(out );
+ path.GetScoreBreakdown()->OutputAllFeatureScores(out);
// total
out << " ||| " << path.GetTotalScore();
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 98ed1f439..3c21a6725 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -31,6 +31,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "InputFileStream.h"
#include "StaticData.h"
#include "util/exception.hh"
+#include "util/random.hh"
#include <boost/program_options.hpp>
@@ -1393,7 +1394,7 @@ struct Credit {
this->contact = contact ;
this->currentPursuits = currentPursuits ;
this->areaResponsibility = areaResponsibility;
- this->sortId = rand() % 1000;
+ this->sortId = util::rand_excl(1000);
}
bool operator<(const Credit &other) const {
diff --git a/moses/Syntax/F2S/HyperTreeLoader.cpp b/moses/Syntax/F2S/HyperTreeLoader.cpp
index f3caa2cec..bd19cbace 100644
--- a/moses/Syntax/F2S/HyperTreeLoader.cpp
+++ b/moses/Syntax/F2S/HyperTreeLoader.cpp
@@ -40,12 +40,12 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
const std::string &inFile,
const RuleTableFF &ff,
- HyperTree &trie)
+ HyperTree &trie,
+ boost::unordered_set<std::size_t> &sourceTermSet)
{
PrintUserTime(std::string("Start loading HyperTree"));
- // const StaticData &staticData = StaticData::Instance();
- // const std::string &factorDelimiter = staticData.GetFactorDelimiter();
+ sourceTermSet.clear();
std::size_t count = 0;
@@ -106,6 +106,7 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
// Source-side
HyperPath sourceFragment;
hyperPathLoader.Load(sourceString, sourceFragment);
+ ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);
// Target-side
TargetPhrase *targetPhrase = new TargetPhrase(&ff);
@@ -144,6 +145,23 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
return true;
}
+void HyperTreeLoader::ExtractSourceTerminalSetFromHyperPath(
+ const HyperPath &hp, boost::unordered_set<std::size_t> &sourceTerminalSet)
+{
+ for (std::vector<HyperPath::NodeSeq>::const_iterator p = hp.nodeSeqs.begin();
+ p != hp.nodeSeqs.end(); ++p) {
+ for (std::vector<std::size_t>::const_iterator q = p->begin();
+ q != p->end(); ++q) {
+ const std::size_t factorId = *q;
+ if (factorId >= moses_MaxNumNonterminals &&
+ factorId != HyperPath::kComma &&
+ factorId != HyperPath::kEpsilon) {
+ sourceTerminalSet.insert(factorId);
+ }
+ }
+ }
+}
+
} // namespace F2S
} // namespace Syntax
} // namespace Moses
diff --git a/moses/Syntax/F2S/HyperTreeLoader.h b/moses/Syntax/F2S/HyperTreeLoader.h
index ea009022d..088c7eaf5 100644
--- a/moses/Syntax/F2S/HyperTreeLoader.h
+++ b/moses/Syntax/F2S/HyperTreeLoader.h
@@ -3,9 +3,12 @@
#include <istream>
#include <vector>
+#include <boost/unordered_set.hpp>
+
#include "moses/TypeDef.h"
#include "moses/Syntax/RuleTableFF.h"
+#include "HyperPath.h"
#include "HyperTree.h"
#include "HyperTreeCreator.h"
@@ -23,7 +26,12 @@ public:
const std::vector<FactorType> &output,
const std::string &inFile,
const RuleTableFF &,
- HyperTree &);
+ HyperTree &,
+ boost::unordered_set<std::size_t> &);
+
+private:
+ void ExtractSourceTerminalSetFromHyperPath(
+ const HyperPath &, boost::unordered_set<std::size_t> &);
};
} // namespace F2S
diff --git a/moses/Syntax/F2S/Manager-inl.h b/moses/Syntax/F2S/Manager-inl.h
index 6c289440c..3aedc640e 100644
--- a/moses/Syntax/F2S/Manager-inl.h
+++ b/moses/Syntax/F2S/Manager-inl.h
@@ -39,6 +39,7 @@ Manager<RuleMatcher>::Manager(ttasksptr const& ttask)
if (const ForestInput *p = dynamic_cast<const ForestInput*>(&m_source)) {
m_forest = p->GetForest();
m_rootVertex = p->GetRootVertex();
+ m_sentenceLength = p->GetSize();
} else if (const TreeInput *p = dynamic_cast<const TreeInput*>(&m_source)) {
T2S::InputTreeBuilder builder;
T2S::InputTree tmpTree;
@@ -46,6 +47,7 @@ Manager<RuleMatcher>::Manager(ttasksptr const& ttask)
boost::shared_ptr<Forest> forest = boost::make_shared<Forest>();
m_rootVertex = T2S::InputTreeToForest(tmpTree, *forest);
m_forest = forest;
+ m_sentenceLength = p->GetSize();
} else {
UTIL_THROW2("ERROR: F2S::Manager requires input to be a tree or forest");
}
@@ -83,8 +85,13 @@ void Manager<RuleMatcher>::Decode()
p = sortedVertices.begin(); p != sortedVertices.end(); ++p) {
const Forest::Vertex &vertex = **p;
- // Skip terminal vertices.
+ // Skip terminal vertices (after checking if they are OOVs).
if (vertex.incoming.empty()) {
+ if (vertex.pvertex.span.GetStartPos() > 0 &&
+ vertex.pvertex.span.GetEndPos() < m_sentenceLength-1 &&
+ IsUnknownSourceWord(vertex.pvertex.symbol)) {
+ m_oovs.insert(vertex.pvertex.symbol);
+ }
continue;
}
@@ -190,6 +197,21 @@ void Manager<RuleMatcher>::InitializeStacks()
}
}
+template<typename RuleMatcher>
+bool Manager<RuleMatcher>::IsUnknownSourceWord(const Word &w) const
+{
+ const std::size_t factorId = w[0]->GetId();
+ const std::vector<RuleTableFF*> &ffs = RuleTableFF::Instances();
+ for (std::size_t i = 0; i < ffs.size(); ++i) {
+ RuleTableFF *ff = ffs[i];
+ const boost::unordered_set<std::size_t> &sourceTerms =
+ ff->GetSourceTerminalSet();
+ if (sourceTerms.find(factorId) != sourceTerms.end()) {
+ return false;
+ }
+ }
+ return true;
+}
template<typename RuleMatcher>
const SHyperedge *Manager<RuleMatcher>::GetBestSHyperedge() const
diff --git a/moses/Syntax/F2S/Manager.h b/moses/Syntax/F2S/Manager.h
index 53f4cff13..44128ad65 100644
--- a/moses/Syntax/F2S/Manager.h
+++ b/moses/Syntax/F2S/Manager.h
@@ -50,10 +50,13 @@ private:
void InitializeStacks();
+ bool IsUnknownSourceWord(const Word &) const;
+
void RecombineAndSort(const std::vector<SHyperedge*> &, SVertexStack &);
boost::shared_ptr<const Forest> m_forest;
const Forest::Vertex *m_rootVertex;
+ std::size_t m_sentenceLength; // Includes <s> and </s>
PVertexToStackMap m_stackMap;
boost::shared_ptr<HyperTree> m_glueRuleTrie;
std::vector<boost::shared_ptr<RuleMatcher> > m_mainRuleMatchers;
diff --git a/moses/Syntax/RuleTableFF.cpp b/moses/Syntax/RuleTableFF.cpp
index f4e06f489..37063e048 100644
--- a/moses/Syntax/RuleTableFF.cpp
+++ b/moses/Syntax/RuleTableFF.cpp
@@ -35,7 +35,8 @@ void RuleTableFF::Load()
staticData.GetSearchAlgorithm() == SyntaxT2S) {
F2S::HyperTree *trie = new F2S::HyperTree(this);
F2S::HyperTreeLoader loader;
- loader.Load(m_input, m_output, m_filePath, *this, *trie);
+ loader.Load(m_input, m_output, m_filePath, *this, *trie,
+ m_sourceTerminalSet);
m_table = trie;
} else if (staticData.GetSearchAlgorithm() == SyntaxS2T) {
S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
diff --git a/moses/Syntax/RuleTableFF.h b/moses/Syntax/RuleTableFF.h
index 4d6132e86..25e7d8428 100644
--- a/moses/Syntax/RuleTableFF.h
+++ b/moses/Syntax/RuleTableFF.h
@@ -43,10 +43,17 @@ public:
return 0;
}
+ // Get the source terminal vocabulary for this table's grammar (as a set of
+ // factor IDs)
+ const boost::unordered_set<std::size_t> &GetSourceTerminalSet() const {
+ return m_sourceTerminalSet;
+ }
+
private:
static std::vector<RuleTableFF*> s_instances;
const RuleTable *m_table;
+ boost::unordered_set<std::size_t> m_sourceTerminalSet;
};
} // Syntax
diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h
index bf08574ff..389b60359 100644
--- a/moses/TranslationModel/CompactPT/MmapAllocator.h
+++ b/moses/TranslationModel/CompactPT/MmapAllocator.h
@@ -24,14 +24,18 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <limits>
#include <iostream>
-#include <sys/mman.h>
#include <cstdio>
#include <unistd.h>
-#ifndef __MMAN_PAGE_SIZE__
-#define __MMAN_PAGE_SIZE__ sysconf(_SC_PAGE_SIZE)
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#else
+#include <sys/mman.h>
#endif
+#include "util/mmap.hh"
+
namespace Moses
{
template <class T>
@@ -60,25 +64,25 @@ public:
MmapAllocator() throw()
: m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
- m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
+ m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
}
MmapAllocator(std::FILE* f_ptr) throw()
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
- m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
+ m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
}
MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
- m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
+ m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) {
}
MmapAllocator(std::string fileName) throw()
: m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
- m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
+ m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
}
@@ -92,7 +96,7 @@ public:
~MmapAllocator() throw() {
if(m_data_ptr && *m_count == 0) {
- munmap(m_data_ptr, m_map_size);
+ util::UnmapOrThrow(m_data_ptr, m_map_size);
if(!m_fixed && std::ftell(m_file_ptr) != -1)
std::fclose(m_file_ptr);
}
@@ -119,13 +123,17 @@ public:
pointer allocate (size_type num, const void* = 0) {
m_map_size = num * sizeof(T);
+#if defined(_WIN32) || defined(_WIN64)
+ // On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags.
+ const int map_shared = 0;
+#else
+ const int map_shared = MAP_SHARED;
+#endif
if(!m_fixed) {
size_t read = 0;
read += ftruncate(m_file_desc, m_map_size);
- m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED,
- m_file_desc, 0);
- if(m_data_ptr == MAP_FAILED)
- std::cerr << "Error: mmapping" << std::endl;
+ m_data_ptr = (char *)util::MapOrThrow(
+ m_map_size, true, map_shared, false, m_file_desc, 0);
return (pointer)m_data_ptr;
} else {
size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
@@ -133,8 +141,8 @@ public:
size_t map_size = m_map_size + relative_offset;
- m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED,
- m_file_desc, map_offset);
+ m_data_ptr = (char *)util::MapOrThrow(
+ m_map_size, false, map_shared, false, m_file_desc, map_offset);
return (pointer)(m_data_ptr + relative_offset);
}
@@ -142,11 +150,11 @@ public:
void deallocate (pointer p, size_type num) {
if(!m_fixed) {
- munmap(p, num * sizeof(T));
+ util::UnmapOrThrow(p, num * sizeof(T));
} else {
size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
size_t relative_offset = m_data_offset - map_offset;
- munmap((pointer)((char*)p - relative_offset), num * sizeof(T));
+ util::UnmapOrThrow((pointer)((char*)p - relative_offset), num * sizeof(T));
}
}
diff --git a/moses/TranslationModel/DynSAInclude/FileHandler.cpp b/moses/TranslationModel/DynSAInclude/FileHandler.cpp
index 9413ffd7c..ecde3c644 100644
--- a/moses/TranslationModel/DynSAInclude/FileHandler.cpp
+++ b/moses/TranslationModel/DynSAInclude/FileHandler.cpp
@@ -1,7 +1,9 @@
#include "FileHandler.h"
#include <cstdio>
-#ifdef WIN32
+// Workaround: plain Windows does not have popen()/pclose().
+// (MinGW already #define's them, so skip the workaround there.)
+#if defined(WIN32) && !defined(__MINGW32__)
#define popen(A, B) _popen(A, B)
#define pclose(A) _pclose(A)
#endif
diff --git a/moses/TranslationModel/DynSAInclude/hash.h b/moses/TranslationModel/DynSAInclude/hash.h
index 8536c46f5..4cf69bf2f 100644
--- a/moses/TranslationModel/DynSAInclude/hash.h
+++ b/moses/TranslationModel/DynSAInclude/hash.h
@@ -6,6 +6,7 @@
#include "utils.h"
#include "FileHandler.h"
#include "util/exception.hh"
+#include "util/random.hh"
using namespace Moses;
typedef uint64_t P; // largest input range is 2^64
@@ -162,7 +163,7 @@ void Hash_shiftAddXOR<T>::initSeeds()
{
v_ = new T[this->H_];
for(count_t i=0; i < this->H_; i++)
- v_[i] = Utils::rand<T>() + 1;
+ v_[i] = util::wide_rand<T>() + 1;
}
template <typename T>
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
@@ -187,9 +188,8 @@ void UnivHash_tableXOR<T>::initSeeds()
// fill with random values
for(count_t j=0; j < this->H_; j++) {
table_[j] = new T[tblLen_];
- for(count_t i=0; i < tblLen_; i++) {
- table_[j][i] = Utils::rand<T>(this->m_-1);
- }
+ for(count_t i=0; i < tblLen_; i++)
+ table_[j][i] = util::wide_rand_excl(this->m_-1);
}
}
template <typename T>
@@ -218,7 +218,7 @@ void UnivHash_noPrimes<T>::initSeeds()
{
a_ = new P[this->H_];
for(T i=0; i < this->H_; i++) {
- a_[i] = Utils::rand<P>();
+ a_[i] = util::wide_rand<P>();
if(a_[i] % 2 == 0) a_[i]++; // a must be odd
}
}
@@ -284,8 +284,8 @@ void UnivHash_linear<T>::initSeeds()
a_[i] = new T[MAX_NGRAM_ORDER];
b_[i] = new T[MAX_NGRAM_ORDER];
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
- a_[i][j] = 1 + Utils::rand<T>();
- b_[i][j] = Utils::rand<T>();
+ a_[i][j] = 1 + util::wide_rand<T>();
+ b_[i][j] = util::wide_rand<T>();
}
}
}
diff --git a/moses/TranslationModel/DynSAInclude/onlineRLM.h b/moses/TranslationModel/DynSAInclude/onlineRLM.h
index 1d3f66eac..050e016c9 100644
--- a/moses/TranslationModel/DynSAInclude/onlineRLM.h
+++ b/moses/TranslationModel/DynSAInclude/onlineRLM.h
@@ -302,7 +302,8 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
}
while(num_fnd > 1) { // get lower order count
//get sub-context of size one less than length found (exluding target)
- if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
+ den_val = query(&ngram[len - num_fnd], num_fnd - 1);
+ if((den_val > 0) &&
(den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
break;
} else --num_fnd; // else backoff to lower ngram order
diff --git a/moses/TranslationModel/DynSAInclude/utils.h b/moses/TranslationModel/DynSAInclude/utils.h
index e2f24fd4f..485e4a065 100644
--- a/moses/TranslationModel/DynSAInclude/utils.h
+++ b/moses/TranslationModel/DynSAInclude/utils.h
@@ -62,22 +62,6 @@ public:
str[i] = tolower(str[i]);
}
}
- // TODO: interface with decent PRG
- template<typename T>
- static T rand(T mod_bnd = 0) {
- T random = 0;
- if(sizeof(T) <= 4) {
- random = static_cast<T>(std::rand());
- } else if(sizeof(T) == 8) {
- random = static_cast<T>(std::rand());
- random <<= 31;
- random <<= 1;
- random |= static_cast<T>(std::rand());
- }
- if(mod_bnd != 0)
- return random % mod_bnd;
- else return random;
- }
};
#endif
diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp
index 3e8c79c0e..c1dc62f12 100644
--- a/moses/TranslationModel/DynSuffixArray.cpp
+++ b/moses/TranslationModel/DynSuffixArray.cpp
@@ -1,4 +1,6 @@
#include "DynSuffixArray.h"
+#include "util/random.hh"
+
#include <iostream>
#include <boost/foreach.hpp>
@@ -315,33 +317,31 @@ int DynSuffixArray::Compare(int pos1, int pos2, int max)
return 0;
}
+namespace
+{
+/// Helper: swap two entries in an int array.
+inline void swap_ints(int array[], int one, int other)
+{
+ const int tmp = array[one];
+ array[one] = array[other];
+ array[other] = tmp;
+}
+}
+
void DynSuffixArray::Qsort(int* array, int begin, int end)
{
if(end > begin) {
- int index;
+ int index = util::rand_incl(begin, end);
{
- index = begin + (rand() % (end - begin + 1));
- int pivot = array[index];
- {
- int tmp = array[index];
- array[index] = array[end];
- array[end] = tmp;
- }
+ const int pivot = array[index];
+ swap_ints(array, index, end);
for(int i=index=begin; i < end; ++i) {
if (Compare(array[i], pivot, 20) <= 0) {
- {
- int tmp = array[index];
- array[index] = array[i];
- array[i] = tmp;
- index++;
- }
+ swap_ints(array, index, i);
+ index++;
}
}
- {
- int tmp = array[index];
- array[index] = array[end];
- array[end] = tmp;
- }
+ swap_ints(array, index, end);
}
Qsort(array, begin, index - 1);
Qsort(array, index + 1, end);
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
index c632f9ff2..773e027cc 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
@@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "util/exception.hh"
+#include "util/tokenize.hh"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
using namespace std;
@@ -30,29 +31,6 @@ void OutputVec(const vector<T> &vec)
cerr << endl;
}
-// from phrase-extract/tables-core.cpp
-inline vector<string> tokenize( const char* input )
-{
- vector< string > token;
- bool betweenWords = true;
- int start=0;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
- if (!isSpace && betweenWords) {
- start = i;
- betweenWords = false;
- } else if (isSpace && !betweenWords) {
- token.push_back( string( input+start, i-start ) );
- betweenWords = true;
- }
- }
- if (!betweenWords)
- token.push_back( string( input+start, i-start ) );
- return token;
-}
-
namespace Moses
{
@@ -464,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
i++;
if (i%100000 == 0) cerr << "." << flush;
- vector<string> token = tokenize( line.c_str() );
+ const vector<string> token = util::tokenize( line );
if (token.size() != 4) {
cerr << "line " << i << " in " << fileName
<< " has wrong number of tokens, skipping:\n"
diff --git a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
index 69b7e9f5f..1d654f4b0 100644
--- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
@@ -1,11 +1,11 @@
// vim:tabstop=2
#include <cstdlib>
-#include <boost/filesystem.hpp>
#include "PhraseDictionaryTransliteration.h"
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
#include "moses/DecodeGraph.h"
#include "moses/DecodeStep.h"
+#include "util/tempfile.hh"
using namespace std;
@@ -70,11 +70,10 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
inputPath.SetTargetPhrases(*this, tpColl, NULL);
} else {
// TRANSLITERATE
- const boost::filesystem::path
- inFile = boost::filesystem::unique_path(),
- outDir = boost::filesystem::unique_path();
+ const util::temp_file inFile;
+ const util::temp_dir outDir;
- ofstream inStream(inFile.c_str());
+ ofstream inStream(inFile.path().c_str());
inStream << sourcePhrase.ToString() << endl;
inStream.close();
@@ -84,14 +83,14 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
" --external-bin-dir " + m_externalDir +
" --input-extension " + m_inputLang +
" --output-extension " + m_outputLang +
- " --oov-file " + inFile.native() +
- " --out-dir " + outDir.native();
+ " --oov-file " + inFile.path() +
+ " --out-dir " + outDir.path();
int ret = system(cmd.c_str());
UTIL_THROW_IF2(ret != 0, "Transliteration script error");
TargetPhraseCollection *tpColl = new TargetPhraseCollection();
- vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.native());
+ vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.path());
vector<TargetPhrase*>::const_iterator iter;
for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
TargetPhrase *tp = *iter;
@@ -102,10 +101,6 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
cache[hash] = value;
inputPath.SetTargetPhrases(*this, tpColl, NULL);
-
- // clean up temporary files
- remove(inFile.c_str());
- boost::filesystem::remove_all(outDir);
}
}
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
index 1ca9dce67..9135b7e73 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
@@ -45,6 +45,7 @@
#include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h"
#include "util/file.hh"
#include "util/exception.hh"
+#include "util/random.hh"
using namespace std;
@@ -62,8 +63,8 @@ char *mkdtemp(char *tempbuf)
return NULL;
}
- srand((unsigned)time(0));
- rand_value = (int)((rand() / ((double)RAND_MAX+1.0)) * 1e6);
+ util::rand_init();
+ rand_value = util::rand_excl(1e6);
tempbase = strrchr(tempbuf, '/');
tempbase = tempbase ? tempbase+1 : tempbuf;
strcpy(tempbasebuf, tempbase);
@@ -130,10 +131,6 @@ int removedirectoryrecursively(const char *dirname)
struct dirent *entry;
char path[PATH_MAX];
- if (path == NULL) {
- fprintf(stderr, "Out of memory error\n");
- return 0;
- }
dir = opendir(dirname);
if (dir == NULL) {
perror("Error opendir()");
diff --git a/moses/TranslationModel/UG/generic/sampling/Sampling.h b/moses/TranslationModel/UG/generic/sampling/Sampling.h
index c60953d5d..652e532bc 100644
--- a/moses/TranslationModel/UG/generic/sampling/Sampling.h
+++ b/moses/TranslationModel/UG/generic/sampling/Sampling.h
@@ -2,19 +2,16 @@
#define __sampling_h
#include <boost/dynamic_bitset.hpp>
#include <vector>
+
+#include "util/random.hh"
+
// Utility functions for proper sub-sampling.
// (c) 2007-2012 Ulrich Germann
namespace Moses
{
- using namespace std;
-inline
-size_t
-randInt(size_t N)
-{
- return N*(rand()/(RAND_MAX+1.));
-}
+using namespace std;
// select a random sample of size /s/ without restitution from the range of
// integers [0,N);
@@ -35,15 +32,15 @@ randomSample(vector<idx_t>& v, size_t s, size_t N)
if (s*10<N) {
boost::dynamic_bitset<uint64_t> check(N,0);
for (size_t i = 0; i < v.size(); i++) {
- size_t x = randInt(N);
- while (check[x]) x = randInt(N);
+ size_t x = util::rand_excl(N);
+ while (check[x]) x = util::rand_excl(N);
check[x]=true;
v[i] = x;
}
} else {
size_t m=0;
for (size_t t = 0; m <= s && t < N; t++)
- if (s==N || randInt(N-t) < s-m) v[m++] = t;
+ if (s==N || util::rand_excl(N-t) < s-m) v[m++] = t;
}
}
diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.cc b/moses/TranslationModel/UG/mm/ug_mmbitext.cc
index 8f1a4aa12..2c00665bb 100644
--- a/moses/TranslationModel/UG/mm/ug_mmbitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_mmbitext.cc
@@ -345,7 +345,7 @@
// {
// boost::lock_guard<boost::mutex> lock(stats->lock);
// if (stats->raw_cnt == ctr) ++stats->raw_cnt;
-// size_t rnum = randInt(stats->raw_cnt - ctr++);
+// size_t rnum = util::rand_excl(stats->raw_cnt - ctr++);
// // cout << stats->raw_cnt << " " << ctr-1 << " "
// // << rnum << " " << max_samples - stats->good << endl;
// if (rnum < max_samples - stats->good)
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h
index fc4b9f0ad..034a74bd9 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h
@@ -69,7 +69,7 @@ namespace ugdiss
// while (chosen < samplesize && next < stop)
// {
// root->readEntry(next,*this);
- // if (randInt(N - sampled++) < samplesize - chosen)
+ // if (util::rand_excl(N - sampled++) < samplesize - chosen)
// {
// ++chosen;
// return true;
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
index 096739fe9..508f09304 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
@@ -9,6 +9,7 @@
#include <iostream>
#include "util/exception.hh"
#include "moses/Util.h"
+#include "util/random.hh"
//#include <cassert>
// #include "ug_bv_iter.h"
@@ -896,13 +897,6 @@ namespace ugdiss
return bv;
}
- inline
- size_t
- randInt(size_t N)
- {
- return size_t(N*(rand()/(RAND_MAX+1.)));
- }
-
/// randomly select up to N occurrences of the sequence
template<typename Token>
sptr<vector<typename ttrack::Position> >
@@ -924,8 +918,8 @@ namespace ugdiss
root->readEntry(I.next,I);
// t: expected number of remaining samples
- double t = (stop - I.pos)/root->aveIndexEntrySize();
- double r = t*rand()/(RAND_MAX+1.);
+ const double t = (stop - I.pos)/root->aveIndexEntrySize();
+ const double r = util::rand_excl(t);
if (r < N-m)
{
ret->at(m).offset = I.offset;
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index af1053438..83b3db6a3 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -16,7 +16,7 @@ namespace Moses
{
using namespace bitext;
using namespace std;
- // using namespace boost;
+ using namespace boost;
void
fillIdSeq(Phrase const& mophrase, size_t const ifactor,
@@ -155,6 +155,10 @@ namespace Moses
input_factor = atoi(param.insert(dflt).first->second.c_str());
// shouldn't that be a string?
+ dflt = pair<string,string> ("output-factor","0");
+ output_factor = atoi(param.insert(dflt).first->second.c_str());
+ ofactor.assign(1,output_factor);
+
dflt = pair<string,string> ("smooth",".01");
m_lbop_conf = atof(param.insert(dflt).first->second.c_str());
diff --git a/moses/TrellisPath.cpp b/moses/TrellisPath.cpp
index e76adc2db..36397e006 100644
--- a/moses/TrellisPath.cpp
+++ b/moses/TrellisPath.cpp
@@ -31,7 +31,6 @@ namespace Moses
TrellisPath::TrellisPath(const Hypothesis *hypo)
: m_prevEdgeChanged(NOT_FOUND)
{
- m_scoreBreakdown = hypo->GetScoreBreakdown();
m_totalScore = hypo->GetTotalScore();
// enumerate path using prevHypo
@@ -41,10 +40,9 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
}
}
-void TrellisPath::InitScore()
+void TrellisPath::InitTotalScore()
{
m_totalScore = m_path[0]->GetWinningHypo()->GetTotalScore();
- m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown();
//calc score
size_t sizePath = m_path.size();
@@ -53,12 +51,8 @@ void TrellisPath::InitScore()
const Hypothesis *winningHypo = hypo->GetWinningHypo();
if (hypo != winningHypo) {
m_totalScore = m_totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore();
- m_scoreBreakdown.MinusEquals(winningHypo->GetScoreBreakdown());
- m_scoreBreakdown.PlusEquals(hypo->GetScoreBreakdown());
}
}
-
-
}
TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc)
@@ -80,7 +74,7 @@ TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypoth
prevHypo = prevHypo->GetPrevHypo();
}
- InitScore();
+ InitTotalScore();
}
TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
@@ -88,9 +82,7 @@ TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
{
m_path.resize(edges.size());
copy(edges.rbegin(),edges.rend(),m_path.begin());
- InitScore();
-
-
+ InitTotalScore();
}
@@ -172,6 +164,32 @@ void TrellisPath::CreateDeviantPaths(TrellisPathList &pathColl) const
}
}
+const boost::shared_ptr<ScoreComponentCollection> TrellisPath::GetScoreBreakdown() const
+{
+ if (!m_scoreBreakdown) {
+ float totalScore = m_path[0]->GetWinningHypo()->GetTotalScore(); // calculated for sanity check only
+
+ m_scoreBreakdown = boost::shared_ptr<ScoreComponentCollection>(new ScoreComponentCollection());
+ m_scoreBreakdown->PlusEquals(ScoreComponentCollection(m_path[0]->GetWinningHypo()->GetScoreBreakdown()));
+
+ //calc score
+ size_t sizePath = m_path.size();
+ for (size_t pos = 0 ; pos < sizePath ; pos++) {
+ const Hypothesis *hypo = m_path[pos];
+ const Hypothesis *winningHypo = hypo->GetWinningHypo();
+ if (hypo != winningHypo) {
+ totalScore = totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore();
+ m_scoreBreakdown->MinusEquals(winningHypo->GetScoreBreakdown());
+ m_scoreBreakdown->PlusEquals(hypo->GetScoreBreakdown());
+ }
+ }
+
+ assert(totalScore == m_totalScore);
+ }
+
+ return m_scoreBreakdown;
+}
+
Phrase TrellisPath::GetTargetPhrase() const
{
Phrase targetPhrase(ARRAY_SIZE_INCR);
diff --git a/moses/TrellisPath.h b/moses/TrellisPath.h
index def86549b..89efb32e4 100644
--- a/moses/TrellisPath.h
+++ b/moses/TrellisPath.h
@@ -19,14 +19,14 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#ifndef moses_TrellisPath_h
-#define moses_TrellisPath_h
+#pragma once
#include <iostream>
#include <vector>
#include <limits>
#include "Hypothesis.h"
#include "TypeDef.h"
+#include <boost/shared_ptr.hpp>
namespace Moses
{
@@ -50,13 +50,13 @@ protected:
, or NOT_FOUND if this path is the best trans so consist of only hypos
*/
- ScoreComponentCollection m_scoreBreakdown;
float m_totalScore;
+ mutable boost::shared_ptr<ScoreComponentCollection> m_scoreBreakdown;
//Used by Manager::LatticeSample()
explicit TrellisPath(const std::vector<const Hypothesis*> edges);
- void InitScore();
+ void InitTotalScore();
public:
TrellisPath(); // not implemented
@@ -91,9 +91,7 @@ public:
//! create a list of next best paths by wiggling 1 of the node at a time.
void CreateDeviantPaths(TrellisPathList &pathColl) const;
- inline const ScoreComponentCollection &GetScoreBreakdown() const {
- return m_scoreBreakdown;
- }
+ const boost::shared_ptr<ScoreComponentCollection> GetScoreBreakdown() const;
//! get target words range of the hypo within n-best trellis. not necessarily the same as hypo.GetCurrTargetWordsRange()
WordsRange GetTargetWordsRange(const Hypothesis &hypo) const;
@@ -123,4 +121,4 @@ inline std::ostream& operator<<(std::ostream& out, const TrellisPath& path)
}
}
-#endif
+
diff --git a/moses/Util.h b/moses/Util.h
index 48e6a51ae..68989721c 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -502,13 +502,11 @@ inline std::string GetFirstString(const std::string& str, int& first_pos, const
template<class T>
T log_sum (T log_a, T log_b)
{
- T v;
if (log_a < log_b) {
- v = log_b+log ( 1 + exp ( log_a-log_b ));
+ return log_b + log1p(exp(log_a - log_b));
} else {
- v = log_a+log ( 1 + exp ( log_b-log_a ));
+ return log_a + log1p(exp(log_b - log_a));
}
- return ( v );
}
/**
diff --git a/moses/mbr.cpp b/moses/mbr.cpp
index df2313b66..66dac47f7 100644
--- a/moses/mbr.cpp
+++ b/moses/mbr.cpp
@@ -105,13 +105,13 @@ const TrellisPath doMBR(const TrellisPathList& nBestList)
for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
const TrellisPath &path = **iter;
float score = StaticData::Instance().GetMBRScale()
- * path.GetScoreBreakdown().GetWeightedScore();
+ * path.GetScoreBreakdown()->GetWeightedScore();
if (maxScore < score) maxScore = score;
}
for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
const TrellisPath &path = **iter;
- joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore() - maxScore);
+ joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown()->GetWeightedScore() - maxScore);
marginal += joint_prob;
joint_prob_vec.push_back(joint_prob);
diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp
index 02558fa84..aab8867b5 100644
--- a/moses/server/TranslationRequest.cpp
+++ b/moses/server/TranslationRequest.cpp
@@ -166,7 +166,7 @@ namespace MosesServer
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
- path->GetScoreBreakdown().OutputAllFeatureScores(buf);
+ path->GetScoreBreakdown()->OutputAllFeatureScores(buf);
nBestXmlItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp
index 899eb9f1c..d5138ba9b 100644
--- a/phrase-extract/DomainFeature.cpp
+++ b/phrase-extract/DomainFeature.cpp
@@ -2,6 +2,7 @@
#include "ExtractionPhrasePair.h"
#include "tables-core.h"
#include "InputFileStream.h"
+#include "util/tokenize.hh"
using namespace std;
@@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName )
string line;
while(getline(*fileP, line)) {
// read
- vector< string > domainSpecLine = tokenize( line.c_str() );
+ const vector< string > domainSpecLine = util::tokenize( line );
int lineNumber;
if (domainSpecLine.size() != 2 ||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
@@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName )
exit(1);
}
// store
- string &name = domainSpecLine[1];
+ const string &name = domainSpecLine[1];
spec.push_back( make_pair( lineNumber, name ));
if (name2id.find( name ) == name2id.end()) {
name2id[ name ] = list.size();
diff --git a/phrase-extract/DomainFeature.h b/phrase-extract/DomainFeature.h
index 040a5fc72..95babb6c2 100644
--- a/phrase-extract/DomainFeature.h
+++ b/phrase-extract/DomainFeature.h
@@ -14,8 +14,6 @@
#include "ScoreFeature.h"
-extern std::vector<std::string> tokenize( const char*);
-
namespace MosesTraining
{
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index ee7f27ed9..21c1a1dbd 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -24,6 +24,7 @@
#include <string>
#include "tables-core.h"
+#include "util/tokenize.hh"
using namespace std;
@@ -40,7 +41,7 @@ void addBoundaryWords(vector<string> &phrase)
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
{
- target = tokenize(targetString);
+ target = util::tokenize(targetString);
if (boundaryRules)
addBoundaryWords(target);
return true;
@@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
{
- source = tokenize(sourceString);
+ source = util::tokenize(sourceString);
if (boundaryRules)
addBoundaryWords(source);
return true;
@@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[],
}
// reading in alignments
- vector<string> alignmentSequence = tokenize( alignmentString );
+ vector<string> alignmentSequence = util::tokenize( alignmentString );
for(size_t i=0; i<alignmentSequence.size(); i++) {
int s,t;
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
index 1b4ed7c88..4fd2355ae 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -26,6 +26,7 @@
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
+#include "util/tokenize.hh"
using namespace std;
@@ -49,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
<< sentenceID << ": " << e.getMsg() << std::endl;
return false;
}
- target = tokenize(targetStringCPP.c_str());
+ target = util::tokenize(targetStringCPP);
return true;
}
@@ -70,11 +71,8 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin
<< sentenceID << ": " << e.getMsg() << std::endl;
return false;
}
- source = tokenize(sourceStringCPP.c_str());
+ source = util::tokenize(sourceStringCPP);
return true;
}
} // namespace
-
-
-
diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp
index 423a3909b..d25197372 100644
--- a/phrase-extract/consolidate-direct-main.cpp
+++ b/phrase-extract/consolidate-direct-main.cpp
@@ -25,11 +25,10 @@
#include <cstdlib>
#include "InputFileStream.h"
#include "OutputFileStream.h"
+#include "util/tokenize.hh"
using namespace std;
-std::vector<std::string> tokenize( const char [] );
-
vector< string > splitLine(const char *line)
{
vector< string > item;
@@ -109,7 +108,7 @@ int main(int argc, char* argv[])
if (! getLine(fileDirectP, itemDirect ))
break;
- vector< string > count = tokenize( itemDirect[4].c_str() );
+ const vector< string > count = util::tokenize( itemDirect[4] );
float countEF = atof(count[0].c_str());
float countF = atof(count[1].c_str());
float prob = countF/countEF;
diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp
index e2b0ad473..bce496a0c 100644
--- a/phrase-extract/consolidate-reverse-main.cpp
+++ b/phrase-extract/consolidate-reverse-main.cpp
@@ -28,6 +28,7 @@
#include "tables-core.h"
#include "InputFileStream.h"
+#include "util/tokenize.hh"
using namespace std;
@@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
// counts, for debugging
- vector<string> directCounts = tokenize(itemDirect[4].c_str());
- vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+ const vector<string> directCounts = util::tokenize(itemDirect[4]);
+ const vector<string> indirectCounts = util::tokenize(itemIndirect[4]);
fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
// output rule count if present in either file
if (indirectCounts.size() > 1) {
@@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item )
vector< string > splitLine(const char *line)
{
vector< string > item;
- bool betweenWords = true;
int start=0;
int i=0;
for(; line[i] != '\0'; i++) {
@@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments)
{
stringstream ret("");
- vector<string> alignToks = tokenize(alignments.c_str());
+ const vector<string> alignToks = util::tokenize(alignments);
for (size_t i = 0; i < alignToks.size(); ++i) {
- string &alignPair = alignToks[i];
+ const string &alignPair = alignToks[i];
vector<string> alignPoints;
Tokenize(alignPoints, alignPair, "-");
assert(alignPoints.size() == 2);
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp
index 2f28c3244..f9800c8e0 100644
--- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp
@@ -23,6 +23,7 @@
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
+#include "util/tokenize.hh"
#include <cassert>
#include <vector>
@@ -56,7 +57,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
m_tree.ConnectNodes();
SyntaxNode *root = m_tree.GetTop();
assert(root);
- m_words = tokenize(m_line.c_str());
+ m_words = util::tokenize(m_line);
return ConvertTree(*root, m_words);
}
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc
index 3d9291994..29e46a9f2 100644
--- a/phrase-extract/pcfg-common/xml_tree_parser.cc
+++ b/phrase-extract/pcfg-common/xml_tree_parser.cc
@@ -25,6 +25,7 @@
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
+#include "util/tokenize.hh"
#include "syntax-common/exception.h"
@@ -51,7 +52,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
// There is no XML tree.
return std::auto_ptr<PcfgTree>();
}
- m_words = tokenize(m_line.c_str());
+ m_words = util::tokenize(m_line);
return ConvertTree(*root, m_words);
}
diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp
index a6d50cef5..5c9daa7ae 100644
--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -21,6 +21,7 @@
#include "relax-parse.h"
#include "tables-core.h"
+#include "util/tokenize.hh"
using namespace std;
using namespace MosesTraining;
@@ -44,7 +45,7 @@ int main(int argc, char* argv[])
map< string, int > topLabelCollection; // count of top labels, not used
SyntaxTree tree;
ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
- vector< string > inWords = tokenize( inBufferString.c_str() );
+ const vector< string > inWords = util::tokenize( inBufferString );
// output tree
// cerr << "BEFORE:" << endl << tree;
@@ -104,7 +105,7 @@ void init(int argc, char* argv[])
}
}
-void store( SyntaxTree &tree, vector< string > &words )
+void store( SyntaxTree &tree, const vector< string > &words )
{
// output words
for( size_t i=0; i<words.size(); i++ ) {
diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h
index ec604405e..9bd0bfb23 100644
--- a/phrase-extract/relax-parse.h
+++ b/phrase-extract/relax-parse.h
@@ -39,7 +39,7 @@ char SAMTLevel = 0;
// functions
void init(int argc, char* argv[]);
-void store( MosesTraining::SyntaxTree &tree, std::vector<std::string> &words );
+void store( MosesTraining::SyntaxTree &tree, const std::vector<std::string> &words );
void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp
index a6c0b74db..840f18602 100644
--- a/phrase-extract/statistics-main.cpp
+++ b/phrase-extract/statistics-main.cpp
@@ -14,6 +14,7 @@
#include "AlignmentPhrase.h"
#include "tables-core.h"
#include "InputFileStream.h"
+#include "util/tokenize.hh"
using namespace std;
using namespace MosesTraining;
@@ -237,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
bool PhraseAlignment::create(const char line[], int lineID )
{
- vector< string > token = tokenize( line );
+ const vector< string > token = util::tokenize( line );
int item = 1;
PHRASE phraseF, phraseE;
for (size_t j=0; j<token.size(); j++) {
@@ -321,7 +322,7 @@ void LexicalTable::load( const string &filePath )
i++;
if (i%100000 == 0) cerr << "." << flush;
- vector<string> token = tokenize( line.c_str() );
+ const vector<string> token = util::tokenize( line );
if (token.size() != 3) {
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
token.size() << " " << token[0] << " " << line << endl;
diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc
index c4363a3e2..c6e3cd3c3 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.cc
+++ b/phrase-extract/syntax-common/xml_tree_parser.cc
@@ -3,6 +3,7 @@
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
+#include "util/tokenize.hh"
#include <cassert>
#include <vector>
@@ -24,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) {
tree_.ConnectNodes();
SyntaxNode *root = tree_.GetTop();
assert(root);
- words_ = tokenize(line_.c_str());
+ words_ = util::tokenize(line_);
return ConvertTree(*root, words_);
}
diff --git a/phrase-extract/tables-core.cpp b/phrase-extract/tables-core.cpp
index 30c1544e9..4dd8e704a 100644
--- a/phrase-extract/tables-core.cpp
+++ b/phrase-extract/tables-core.cpp
@@ -1,5 +1,6 @@
// $Id$
//#include "beammain.h"
+#include "util/tokenize.hh"
#include "tables-core.h"
#define TABLE_LINE_MAX_LENGTH 1000
@@ -7,37 +8,9 @@
using namespace std;
-// as in beamdecoder/tables.cpp
-vector<string> tokenize( const char* input )
-{
- vector< string > token;
- bool betweenWords = true;
- int start=0;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
- if (!isSpace && betweenWords) {
- start = i;
- betweenWords = false;
- } else if (isSpace && !betweenWords) {
- token.push_back( string( input+start, i-start ) );
- betweenWords = true;
- }
- }
- if (!betweenWords)
- token.push_back( string( input+start, i-start ) );
- return token;
-}
-
namespace MosesTraining
{
-bool isNonTerminal( const WORD &symbol )
-{
- return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
-}
-
WORD_ID Vocabulary::storeIfNew( const WORD& word )
{
map<WORD, WORD_ID>::iterator i = lookup.find( word );
@@ -107,7 +80,7 @@ void DTable::load( const string& fileName )
abort();
}
- vector<string> token = tokenize(line.c_str());
+ const vector<string> token = util::tokenize(line);
if (token.size() < 2) {
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
continue;
diff --git a/phrase-extract/tables-core.h b/phrase-extract/tables-core.h
index 44545d3a0..011fe09e6 100644
--- a/phrase-extract/tables-core.h
+++ b/phrase-extract/tables-core.h
@@ -12,8 +12,6 @@
#include <map>
#include <cmath>
-extern std::vector<std::string> tokenize( const char*);
-
namespace MosesTraining
{
diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl
index 27ecfe342..e2b604f0b 100755
--- a/scripts/OSM/OSM-Train.perl
+++ b/scripts/OSM/OSM-Train.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/OSM/extract-singletons.perl b/scripts/OSM/extract-singletons.perl
index faa4e8dd6..83719502f 100755
--- a/scripts/OSM/extract-singletons.perl
+++ b/scripts/OSM/extract-singletons.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use Getopt::Std;
getopts('q');
diff --git a/scripts/OSM/flipAlignment.perl b/scripts/OSM/flipAlignment.perl
index 3c14a4542..3559bf79b 100755
--- a/scripts/OSM/flipAlignment.perl
+++ b/scripts/OSM/flipAlignment.perl
@@ -1,5 +1,7 @@
#!/usr/bin/env perl
- use strict;
+
+use warnings;
+use strict;
my $file = shift(@ARGV);
open(MYFILE, $file);
diff --git a/scripts/Transliteration/clean.pl b/scripts/Transliteration/clean.pl
index 252a25075..c59bf0798 100755
--- a/scripts/Transliteration/clean.pl
+++ b/scripts/Transliteration/clean.pl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
#input hindi word urdu word, delete all those entries that have number on any side
+use warnings;
use utf8;
use Getopt::Std;
diff --git a/scripts/Transliteration/corpusCreator.pl b/scripts/Transliteration/corpusCreator.pl
index 8c8dab863..d2df8323c 100755
--- a/scripts/Transliteration/corpusCreator.pl
+++ b/scripts/Transliteration/corpusCreator.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl
index e4e8b41e3..216d99a3e 100755
--- a/scripts/Transliteration/in-decoding-transliteration.pl
+++ b/scripts/Transliteration/in-decoding-transliteration.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl
index 7e6f249ae..201f40d97 100755
--- a/scripts/Transliteration/post-decoding-transliteration.pl
+++ b/scripts/Transliteration/post-decoding-transliteration.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
index 565a98297..4fc03b526 100755
--- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl
+++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/Transliteration/threshold.pl b/scripts/Transliteration/threshold.pl
index 8af699821..8e3704fd6 100755
--- a/scripts/Transliteration/threshold.pl
+++ b/scripts/Transliteration/threshold.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use utf8;
require Encode;
use IO::Handle;
diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl
index 54c2ccf78..05804afb6 100755
--- a/scripts/Transliteration/train-transliteration-module.pl
+++ b/scripts/Transliteration/train-transliteration-module.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use utf8;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
index b74aa003d..149676b6f 100755
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@@ -14,6 +14,7 @@ use utf8;
# 23.01.2010: added NIST p-value and interval computation
###############################################
+use warnings;
use strict;
#constants
diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl
index c8bc367b2..4f6560a56 100755
--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@@ -4,6 +4,7 @@
#sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
#usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html
+use warnings;
use strict;
use Getopt::Long;
diff --git a/scripts/analysis/sg2dot.perl b/scripts/analysis/sg2dot.perl
index a165cf25e..b17dfd9fb 100755
--- a/scripts/analysis/sg2dot.perl
+++ b/scripts/analysis/sg2dot.perl
@@ -4,6 +4,7 @@
# Script to convert MOSES searchgraph to DOT format
#
+use warnings;
use strict;
use File::Path;
use File::Basename;
diff --git a/scripts/analysis/show-phrases-used.pl b/scripts/analysis/show-phrases-used.pl
index c31e930d5..0a719d207 100755
--- a/scripts/analysis/show-phrases-used.pl
+++ b/scripts/analysis/show-phrases-used.pl
@@ -5,7 +5,9 @@
#usage: show-phrases-used DECODER_OUTFILE > output.html
# where DECODER_OUTFILE is the output of moses with the -T (show alignments) option
+use warnings;
use strict;
+
BEGIN
{
my $wd= `pawd 2>/dev/null`;
diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl
index ed09d0b3f..9f411f3fa 100755
--- a/scripts/analysis/smtgui/filter-phrase-table.pl
+++ b/scripts/analysis/smtgui/filter-phrase-table.pl
@@ -9,6 +9,7 @@
#similar function to filter-model-given-input.pl, but only operates
#on the phrase table and doesn't require that any subdirectories exist
+use warnings;
use strict;
my $MAX_LENGTH = 10;
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index bc0a3b6b9..57ef4f9d6 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -7,8 +7,15 @@ get-corpus
default-name: corpus/txt
rerun-on-change: input-extension output-extension
template: IN OUT $input-extension $output-extension
+pre-tok-clean
+ in: raw-stem
+ out: pre-tok-cleaned
+ default-name: corpus/pre-tok-cleaned
+ pass-unless: pre-tok-clean
+ template: $pre-tok-clean IN $input-extension $output-extension OUT OUT.lines-retained
+ parallelizable: yes
tokenize
- in: raw-stem
+ in: pre-tok-cleaned
out: tokenized-stem
default-name: corpus/tok
pass-unless: input-tokenizer output-tokenizer
@@ -158,11 +165,18 @@ get-corpus
pass-unless: get-corpus-script
default-name: lm/txt
template: $get-corpus-script > OUT
+use-parallel-corpus
+ in: parallel-corpus-stem
+ out: tokenized-corpus
+ default-name: lm/tok
+ ignore-unless: parallel-corpus-stem
+ template: ln -s IN.$output-extension OUT
tokenize
in: raw-corpus
out: tokenized-corpus
default-name: lm/tok
pass-unless: output-tokenizer
+ ignore-if: parallel-corpus-stem
template: $output-tokenizer < IN > OUT
parallelizable: yes
mock-parse
@@ -204,8 +218,14 @@ split
default-name: lm/split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
+strip
+ in: split-corpus
+ out: stripped-corpus
+ default-name: lm/stripped
+ pass-unless: mock-output-parser-lm
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT
train
- in: split-corpus
+ in: stripped-corpus
out: lm
default-name: lm/lm
ignore-if: rlm-training
@@ -220,7 +240,7 @@ randomize
pass-unless: lm-randomizer
ignore-if: rlm-training
train-randomized
- in: split-corpus
+ in: stripped-corpus
out: rlm
default-name: lm/rlm
ignore-unless: rlm-training
@@ -953,21 +973,21 @@ split-reference-devtest
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
-reduce-reference
+strip-reference
in: split-ref
out: reference
- default-name: tuning/reference.reduced
+ default-name: tuning/reference.stripped
pass-unless: mock-output-parser-references
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
- template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
-reduce-reference-devtest
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+strip-reference-devtest
in: split-ref-devtest
out: reference
- default-name: tuning/reference.devtest.reduced
+ default-name: tuning/reference.devtest.stripped
pass-unless: mock-output-parser-references
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
- template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
filter
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
out: filtered-dir
@@ -1224,13 +1244,13 @@ lowercase-reference
pass-if: recaser
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
-reduce-reference
+strip-reference
in: lowercased-reference
out: reference
default-name: evaluation/reference
pass-unless: mock-output-parser-references
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
- template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
wade
in: filtered-dir truecased-input tokenized-reference alignment system-output
out: wade-analysis
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index a7ce88622..7070a7c9e 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -3,6 +3,7 @@
# Experiment Management System
# Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/ems/fix-info.perl b/scripts/ems/fix-info.perl
index 98139f211..8f83d4ccf 100755
--- a/scripts/ems/fix-info.perl
+++ b/scripts/ems/fix-info.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($file,$step) = @ARGV;
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index 8df3d6551..cea2657c9 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/ems/support/build-domain-file-from-subcorpora.perl b/scripts/ems/support/build-domain-file-from-subcorpora.perl
index 683ef1ed7..f166c8927 100755
--- a/scripts/ems/support/build-domain-file-from-subcorpora.perl
+++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
# Create domain file from corpora
diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl
index 04da69873..5d9b786ad 100755
--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
# Build necessary files for sparse lexical features
diff --git a/scripts/ems/support/consolidate-training-data.perl b/scripts/ems/support/consolidate-training-data.perl
index f312b1649..170ba999c 100755
--- a/scripts/ems/support/consolidate-training-data.perl
+++ b/scripts/ems/support/consolidate-training-data.perl
@@ -2,6 +2,7 @@
# $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $
+use warnings;
use strict;
my ($in,$out,$consolidated,@PART) = @ARGV;
diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl
index 073e0f62e..e5a12adce 100755
--- a/scripts/ems/support/generic-multicore-parallelizer.perl
+++ b/scripts/ems/support/generic-multicore-parallelizer.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $cores = 8;
diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl
index db4d2f492..0b248be7e 100755
--- a/scripts/ems/support/generic-parallelizer.perl
+++ b/scripts/ems/support/generic-parallelizer.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $jobs = 20;
diff --git a/scripts/ems/support/input-from-sgm.perl b/scripts/ems/support/input-from-sgm.perl
index 81f177d6c..223996676 100755
--- a/scripts/ems/support/input-from-sgm.perl
+++ b/scripts/ems/support/input-from-sgm.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt")
diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl
index 34bd2219d..a2fe62b22 100755
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use IPC::Open3;
use File::Temp qw/tempdir/;
diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl
index 0fa676ce8..eadca6263 100755
--- a/scripts/ems/support/lmplz-wrapper.perl
+++ b/scripts/ems/support/lmplz-wrapper.perl
@@ -1,10 +1,13 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
+Getopt::Long::config("no_auto_abbrev");
Getopt::Long::config("pass_through");
+
my ($TEXT,$ORDER,$BIN,$LM);
&GetOptions('text=s' => \$TEXT,
@@ -15,8 +18,9 @@ my ($TEXT,$ORDER,$BIN,$LM);
die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!")
unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER);
-my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM";
-$cmd .= " " . join(' ', @ARGV) if scalar(@ARGV); # Pass remaining args through.
+my $settings = join(' ', @ARGV);
+#print STDERR "settngs=$settings \n";
+my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings";
print "exec: $cmd\n";
`$cmd`;
diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl
index 5b6e02834..c50725aae 100755
--- a/scripts/ems/support/mml-filter.perl
+++ b/scripts/ems/support/mml-filter.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use FindBin qw($RealBin);
diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl
index 1fe065586..449d6a05c 100755
--- a/scripts/ems/support/mml-score.perl
+++ b/scripts/ems/support/mml-score.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
#
diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl
index aacf153a7..1f0548082 100755
--- a/scripts/ems/support/mml-train.perl
+++ b/scripts/ems/support/mml-train.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($indomain_source,,$indomain_target,$outdomain_source,$outdomain_target,$lm_training,$lm_binarizer,$order,$lm_settings,$line_count,$model);
diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl
index 1d6e75422..54c124af0 100755
--- a/scripts/ems/support/prepare-fast-align.perl
+++ b/scripts/ems/support/prepare-fast-align.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($source_file,$target_file,$alignment_factors) = @ARGV;
@@ -22,7 +23,7 @@ while(my $source = <SOURCE>) {
# remove markup
foreach my $line (\$source,\$target) {
- $$line =~ s/\<[^\>]+\>//g;
+ $$line =~ s/\<[^\>]+\>/ /g;
$$line =~ s/\s+/ /g;
$$line =~ s/^ //;
$$line =~ s/ $//;
diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl
index 0749648c0..595226bf1 100755
--- a/scripts/ems/support/reference-from-sgm.perl
+++ b/scripts/ems/support/reference-from-sgm.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
die("ERROR syntax: reference-from-sgm.perl ref src out")
diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl
index b345c9a7e..d6333f813 100755
--- a/scripts/ems/support/remove-segmentation-markup.perl
+++ b/scripts/ems/support/remove-segmentation-markup.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
$|++;
diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl
index 5bcf32f48..2e433f291 100755
--- a/scripts/ems/support/report-experiment-scores.perl
+++ b/scripts/ems/support/report-experiment-scores.perl
@@ -2,6 +2,7 @@
# $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $
+use warnings;
use strict;
my $email;
diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl
index f8e211582..c3db3c4dc 100755
--- a/scripts/ems/support/run-command-on-multiple-refsets.perl
+++ b/scripts/ems/support/run-command-on-multiple-refsets.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
die("ERROR: syntax: run-command-on-multiple-refsets.perl cmd in out")
diff --git a/scripts/ems/support/run-wade.perl b/scripts/ems/support/run-wade.perl
index cf4121a14..25cda3bb3 100755
--- a/scripts/ems/support/run-wade.perl
+++ b/scripts/ems/support/run-wade.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use File::Temp qw/ tempfile tempdir /;
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 6537e84b3..f1af451b3 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -6,6 +6,7 @@ binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
+use warnings;
use FindBin qw($RealBin);
use strict;
diff --git a/scripts/ems/support/submit-grid.perl b/scripts/ems/support/submit-grid.perl
index 6e6193674..9997241e7 100755
--- a/scripts/ems/support/submit-grid.perl
+++ b/scripts/ems/support/submit-grid.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Cwd;
use FindBin qw($RealBin);
@@ -37,7 +38,7 @@ print $runFile "export PATH=\"$path\"\n\n";
print $runFile "export PERL5LIB=\"/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1:/home/$user/perl5/lib/perl5\"\n\n";
print $runFile "module load NYUAD/2.0 \n";
-print $runFile "module load gcc/4.9.1 python/2.7.9 openmpi/1.8.3 boost cmake zlib jdk perl expat\n\n";
+print $runFile "module load gcc python/2.7.9 boost cmake zlib jdk perl expat \n\n";
my $emsDir = dirname($RealBin);
diff --git a/scripts/ems/support/substitute-filtered-tables-and-weights.perl b/scripts/ems/support/substitute-filtered-tables-and-weights.perl
index 9c06b54f8..681d251c7 100755
--- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl
+++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl
index eee454728..e7d9f55f8 100755
--- a/scripts/ems/support/substitute-filtered-tables.perl
+++ b/scripts/ems/support/substitute-filtered-tables.perl
@@ -1,5 +1,7 @@
#!/usr/bin/env perl
+use warnings;
+
# experiment.perl support script
# get filtered rule and reordering tables and place them into a configuration file
diff --git a/scripts/ems/support/substitute-weights.perl b/scripts/ems/support/substitute-weights.perl
index 24ac034e8..42357ed1e 100755
--- a/scripts/ems/support/substitute-weights.perl
+++ b/scripts/ems/support/substitute-weights.perl
@@ -1,5 +1,7 @@
#!/usr/bin/env perl
+use warnings;
+
# experiment.perl support script
# get filtered rule and reordering tables and place them into a configuration file
diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl
index f93af642d..90621dea9 100755
--- a/scripts/ems/support/symmetrize-fast-align.perl
+++ b/scripts/ems/support/symmetrize-fast-align.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
die("ERROR: syntax is fastalign2bal.perl direct-alignment inverse-alignment source-file target-file out-stem symmetrization-method symal\n") unless scalar(@ARGV) == 7;
diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl
index bd1f89c7b..222623c5b 100755
--- a/scripts/ems/support/thot-lm-wrapper.perl
+++ b/scripts/ems/support/thot-lm-wrapper.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl
index 587e4c541..28708a62a 100755
--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($language,$src,$system) = @ARGV;
diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php
index a64d5977f..00bb9e15f 100644
--- a/scripts/ems/web/analysis.php
+++ b/scripts/ems/web/analysis.php
@@ -1261,8 +1261,8 @@ function input_annotation($sentence,$input,$segmentation,$filter) {
for($j=$from;$j<=$to;$j++) {
if ($j>$from) { $phrase .= " "; }
$phrase .= $word[$j];
- $highlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='#ffff80';";
- $lowlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
+ $highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
+ $lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
}
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($phrase)."');\"":"").">";
}
@@ -1443,10 +1443,10 @@ function biconcor($query) {
$sentence = $_GET['sentence'];
$biconcor = get_biconcor_version($dir,$set,$id);
print "<center>
-<form method=get id=\"BiconcorForm\">
+<form method=\"get\" id=\"BiconcorForm\" onsubmit=\"return false;\">
<img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\">
<input width=20 id=\"BiconcorQuery\" value=\"$query\">
-<input type=submit onclick=\"show_biconcor($sentence,encodeBase64(document.getElementById('BiconcorQuery').value));\" value=\"look up\">
+<input type=submit onclick=\"show_biconcor($sentence,Base64.encode(document.getElementById('BiconcorQuery').value));\" value=\"look up\">
</form>
<div class=\"biconcor-content\">";
$cmd = "./biconcor -html -l $dir/model/biconcor.$biconcor -Q ".base64_encode($query)." 2>/dev/null";
diff --git a/scripts/ems/web/base64.js b/scripts/ems/web/base64.js
index e0e94d765..67fd9ad8d 100644
--- a/scripts/ems/web/base64.js
+++ b/scripts/ems/web/base64.js
@@ -1,108 +1,193 @@
-var END_OF_INPUT = -1;
+/*
+ * $Id: base64.js,v 2.15 2014/04/05 12:58:57 dankogai Exp dankogai $
+ *
+ * Licensed under the MIT license.
+ * http://opensource.org/licenses/mit-license
+ *
+ * References:
+ * http://en.wikipedia.org/wiki/Base64
+ */
-var base64Chars = new Array(
- 'A','B','C','D','E','F','G','H',
- 'I','J','K','L','M','N','O','P',
- 'Q','R','S','T','U','V','W','X',
- 'Y','Z','a','b','c','d','e','f',
- 'g','h','i','j','k','l','m','n',
- 'o','p','q','r','s','t','u','v',
- 'w','x','y','z','0','1','2','3',
- '4','5','6','7','8','9','+','/'
-);
-
-var reverseBase64Chars = new Array();
-for (var i=0; i < base64Chars.length; i++){
- reverseBase64Chars[base64Chars[i]] = i;
-}
-
-var base64Str;
-var base64Count;
-function setBase64Str(str){
- base64Str = str;
- base64Count = 0;
-}
-function readBase64(){
- if (!base64Str) return END_OF_INPUT;
- if (base64Count >= base64Str.length) return END_OF_INPUT;
- var c = base64Str.charCodeAt(base64Count) & 0xff;
- base64Count++;
- return c;
-}
-function encodeBase64(str){
- setBase64Str(str);
- var result = '';
- var inBuffer = new Array(3);
- var lineCount = 0;
- var done = false;
- while (!done && (inBuffer[0] = readBase64()) != END_OF_INPUT){
- inBuffer[1] = readBase64();
- inBuffer[2] = readBase64();
- result += (base64Chars[ inBuffer[0] >> 2 ]);
- if (inBuffer[1] != END_OF_INPUT){
- result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30) | (inBuffer[1] >> 4) ]);
- if (inBuffer[2] != END_OF_INPUT){
- result += (base64Chars [((inBuffer[1] << 2) & 0x3c) | (inBuffer[2] >> 6) ]);
- result += (base64Chars [inBuffer[2] & 0x3F]);
- } else {
- result += (base64Chars [((inBuffer[1] << 2) & 0x3c)]);
- result += ('=');
- done = true;
- }
+(function(global) {
+ 'use strict';
+ // existing version for noConflict()
+ var _Base64 = global.Base64;
+ var version = "2.1.7";
+ // if node.js, we use Buffer
+ var buffer;
+ if (typeof module !== 'undefined' && module.exports) {
+ buffer = require('buffer').Buffer;
+ }
+ // constants
+ var b64chars
+ = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
+ var b64tab = function(bin) {
+ var t = {};
+ for (var i = 0, l = bin.length; i < l; i++) t[bin.charAt(i)] = i;
+ return t;
+ }(b64chars);
+ var fromCharCode = String.fromCharCode;
+ // encoder stuff
+ var cb_utob = function(c) {
+ if (c.length < 2) {
+ var cc = c.charCodeAt(0);
+ return cc < 0x80 ? c
+ : cc < 0x800 ? (fromCharCode(0xc0 | (cc >>> 6))
+ + fromCharCode(0x80 | (cc & 0x3f)))
+ : (fromCharCode(0xe0 | ((cc >>> 12) & 0x0f))
+ + fromCharCode(0x80 | ((cc >>> 6) & 0x3f))
+ + fromCharCode(0x80 | ( cc & 0x3f)));
} else {
- result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30)]);
- result += ('=');
- result += ('=');
- done = true;
- }
- lineCount += 4;
- if (lineCount >= 76){
- result += ('\n');
- lineCount = 0;
+ var cc = 0x10000
+ + (c.charCodeAt(0) - 0xD800) * 0x400
+ + (c.charCodeAt(1) - 0xDC00);
+ return (fromCharCode(0xf0 | ((cc >>> 18) & 0x07))
+ + fromCharCode(0x80 | ((cc >>> 12) & 0x3f))
+ + fromCharCode(0x80 | ((cc >>> 6) & 0x3f))
+ + fromCharCode(0x80 | ( cc & 0x3f)));
}
+ };
+ var re_utob = /[\uD800-\uDBFF][\uDC00-\uDFFFF]|[^\x00-\x7F]/g;
+ var utob = function(u) {
+ return u.replace(re_utob, cb_utob);
+ };
+ var cb_encode = function(ccc) {
+ var padlen = [0, 2, 1][ccc.length % 3],
+ ord = ccc.charCodeAt(0) << 16
+ | ((ccc.length > 1 ? ccc.charCodeAt(1) : 0) << 8)
+ | ((ccc.length > 2 ? ccc.charCodeAt(2) : 0)),
+ chars = [
+ b64chars.charAt( ord >>> 18),
+ b64chars.charAt((ord >>> 12) & 63),
+ padlen >= 2 ? '=' : b64chars.charAt((ord >>> 6) & 63),
+ padlen >= 1 ? '=' : b64chars.charAt(ord & 63)
+ ];
+ return chars.join('');
+ };
+ var btoa = global.btoa ? function(b) {
+ return global.btoa(b);
+ } : function(b) {
+ return b.replace(/[\s\S]{1,3}/g, cb_encode);
+ };
+ var _encode = buffer ? function (u) {
+ return (u.constructor === buffer.constructor ? u : new buffer(u))
+ .toString('base64')
}
- return result;
-}
-function readReverseBase64(){
- if (!base64Str) return END_OF_INPUT;
- while (true){
- if (base64Count >= base64Str.length) return END_OF_INPUT;
- var nextCharacter = base64Str.charAt(base64Count);
- base64Count++;
- if (reverseBase64Chars[nextCharacter]){
- return reverseBase64Chars[nextCharacter];
+ : function (u) { return btoa(utob(u)) }
+ ;
+ var encode = function(u, urisafe) {
+ return !urisafe
+ ? _encode(String(u))
+ : _encode(String(u)).replace(/[+\/]/g, function(m0) {
+ return m0 == '+' ? '-' : '_';
+ }).replace(/=/g, '');
+ };
+ var encodeURI = function(u) { return encode(u, true) };
+ // decoder stuff
+ var re_btou = new RegExp([
+ '[\xC0-\xDF][\x80-\xBF]',
+ '[\xE0-\xEF][\x80-\xBF]{2}',
+ '[\xF0-\xF7][\x80-\xBF]{3}'
+ ].join('|'), 'g');
+ var cb_btou = function(cccc) {
+ switch(cccc.length) {
+ case 4:
+ var cp = ((0x07 & cccc.charCodeAt(0)) << 18)
+ | ((0x3f & cccc.charCodeAt(1)) << 12)
+ | ((0x3f & cccc.charCodeAt(2)) << 6)
+ | (0x3f & cccc.charCodeAt(3)),
+ offset = cp - 0x10000;
+ return (fromCharCode((offset >>> 10) + 0xD800)
+ + fromCharCode((offset & 0x3FF) + 0xDC00));
+ case 3:
+ return fromCharCode(
+ ((0x0f & cccc.charCodeAt(0)) << 12)
+ | ((0x3f & cccc.charCodeAt(1)) << 6)
+ | (0x3f & cccc.charCodeAt(2))
+ );
+ default:
+ return fromCharCode(
+ ((0x1f & cccc.charCodeAt(0)) << 6)
+ | (0x3f & cccc.charCodeAt(1))
+ );
}
- if (nextCharacter == 'A') return 0;
+ };
+ var btou = function(b) {
+ return b.replace(re_btou, cb_btou);
+ };
+ var cb_decode = function(cccc) {
+ var len = cccc.length,
+ padlen = len % 4,
+ n = (len > 0 ? b64tab[cccc.charAt(0)] << 18 : 0)
+ | (len > 1 ? b64tab[cccc.charAt(1)] << 12 : 0)
+ | (len > 2 ? b64tab[cccc.charAt(2)] << 6 : 0)
+ | (len > 3 ? b64tab[cccc.charAt(3)] : 0),
+ chars = [
+ fromCharCode( n >>> 16),
+ fromCharCode((n >>> 8) & 0xff),
+ fromCharCode( n & 0xff)
+ ];
+ chars.length -= [0, 0, 2, 1][padlen];
+ return chars.join('');
+ };
+ var atob = global.atob ? function(a) {
+ return global.atob(a);
+ } : function(a){
+ return a.replace(/[\s\S]{1,4}/g, cb_decode);
+ };
+ var _decode = buffer ? function(a) {
+ return (a.constructor === buffer.constructor
+ ? a : new buffer(a, 'base64')).toString();
}
- return END_OF_INPUT;
-}
-function ntos(n){
- n=n.toString(16);
- if (n.length == 1) n="0"+n;
- n="%"+n;
- return unescape(n);
-}
-
-function decodeBase64(str){
- setBase64Str(str);
- var result = "";
- var inBuffer = new Array(4);
- var done = false;
- while (!done && (inBuffer[0] = readReverseBase64()) != END_OF_INPUT
- && (inBuffer[1] = readReverseBase64()) != END_OF_INPUT){
- inBuffer[2] = readReverseBase64();
- inBuffer[3] = readReverseBase64();
- result += ntos((((inBuffer[0] << 2) & 0xff)| inBuffer[1] >> 4));
- if (inBuffer[2] != END_OF_INPUT){
- result += ntos((((inBuffer[1] << 4) & 0xff)| inBuffer[2] >> 2));
- if (inBuffer[3] != END_OF_INPUT){
- result += ntos((((inBuffer[2] << 6) & 0xff) | inBuffer[3]));
- } else {
- done = true;
- }
- } else {
- done = true;
- }
+ : function(a) { return btou(atob(a)) };
+ var decode = function(a){
+ return _decode(
+ String(a).replace(/[-_]/g, function(m0) { return m0 == '-' ? '+' : '/' })
+ .replace(/[^A-Za-z0-9\+\/]/g, '')
+ );
+ };
+ var noConflict = function() {
+ var Base64 = global.Base64;
+ global.Base64 = _Base64;
+ return Base64;
+ };
+ // export Base64
+ global.Base64 = {
+ VERSION: version,
+ atob: atob,
+ btoa: btoa,
+ fromBase64: decode,
+ toBase64: encode,
+ utob: utob,
+ encode: encode,
+ encodeURI: encodeURI,
+ btou: btou,
+ decode: decode,
+ noConflict: noConflict
+ };
+ // if ES5 is available, make Base64.extendString() available
+ if (typeof Object.defineProperty === 'function') {
+ var noEnum = function(v){
+ return {value:v,enumerable:false,writable:true,configurable:true};
+ };
+ global.Base64.extendString = function () {
+ Object.defineProperty(
+ String.prototype, 'fromBase64', noEnum(function () {
+ return decode(this)
+ }));
+ Object.defineProperty(
+ String.prototype, 'toBase64', noEnum(function (urisafe) {
+ return encode(this, urisafe)
+ }));
+ Object.defineProperty(
+ String.prototype, 'toBase64URI', noEnum(function () {
+ return encode(this, true)
+ }));
+ };
}
- return result;
+ // that's it!
+})(this);
+
+if (this['Meteor']) {
+ Base64 = global.Base64; // for normal export in Meteor.js
}
diff --git a/scripts/ems/web/bilingual-concordance.css b/scripts/ems/web/bilingual-concordance.css
index e232337d2..4648a21dd 100644
--- a/scripts/ems/web/bilingual-concordance.css
+++ b/scripts/ems/web/bilingual-concordance.css
@@ -93,5 +93,6 @@ span.mismatch_aligned {
td.pp_more {
font-size: 70%;
+ color: navy;
text-align: center;
}
diff --git a/scripts/ems/web/index.php b/scripts/ems/web/index.php
index 6b785cf3f..d216b114a 100644
--- a/scripts/ems/web/index.php
+++ b/scripts/ems/web/index.php
@@ -8,7 +8,7 @@ require("diff.php");
require("sgviz.php");
function head($title) {
- print '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+ print '<!DOCTYPE html>
<html><head><title>'.$title.'</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<script language="javascript" src="javascripts/prototype.js"></script>
diff --git a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc
index 21f8c8cf6..57f78eb53 100644
--- a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc
+++ b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc
@@ -32,8 +32,8 @@ in a directory of your website, e.g. /javascripts.
Now, you can include the scripts by adding the following
tags to the HEAD section of your HTML pages:
- <script src="/javascripts/prototype.js" type="text/javascript"></script>
- <script src="/javascripts/scriptaculous.js" type="text/javascript"></script>
+ <script src="javascripts/prototype.js" type="text/javascript"></script>
+ <script src="javascripts/scriptaculous.js" type="text/javascript"></script>
scriptaculous.js will automatically load the other files of the
script.aculo.us distribution in, provided they are accessible
@@ -56,4 +56,4 @@ the sources of the examples provided.
== License
script.aculo.us is licensed under the terms of the MIT License,
-see the included MIT-LICENSE file. \ No newline at end of file
+see the included MIT-LICENSE file.
diff --git a/scripts/ems/web/overview.php b/scripts/ems/web/overview.php
index e56ed6f08..ce0434bb8 100644
--- a/scripts/ems/web/overview.php
+++ b/scripts/ems/web/overview.php
@@ -1,6 +1,5 @@
<?php
-date_default_timezone_set('Europe/London');
function setup() {
$setup = file("setup");
@@ -13,7 +12,7 @@ function setup() {
print "<TR><TD><A HREF=\"?setup=$dir[0]\">$dir[0]</A></TD><TD>$dir[1]</TD><TD>$dir[2]</TD><TD>$dir[3]</TD></TR>\n";
}
print "</TABLE>\n";
- print "<P>To add experiment, edit /fs/thor4/html/experiment/setup";
+ print "<p>To add experiment, edit the \"setup\" file.</p>";
}
function overview() {
@@ -26,7 +25,7 @@ function overview() {
head("Task: $task ($user)");
print "<a href=\"http://www.statmt.org/wiki/?n=Experiment.$setup\">Wiki Notes</a>";
- print " &nbsp; &nbsp; | &nbsp; &nbsp; <a href=\"/\">Overview of experiments</a> &nbsp; &nbsp; | &nbsp; &nbsp; <code>$dir</code><p>";
+ print " &nbsp; &nbsp; | &nbsp; &nbsp; <a href=\"?\">Overview of experiments</a> &nbsp; &nbsp; | &nbsp; &nbsp; <code>$dir</code><p>";
reset($experiment);
print "<form action=\"\" method=get>\n";
diff --git a/scripts/ems/web/progress.perl b/scripts/ems/web/progress.perl
index 0612a0a44..fd742e410 100755
--- a/scripts/ems/web/progress.perl
+++ b/scripts/ems/web/progress.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Date::Parse;
diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl
index 56d4dff0f..80a1b3120 100755
--- a/scripts/fuzzy-match/create_xml.perl
+++ b/scripts/fuzzy-match/create_xml.perl
@@ -3,6 +3,7 @@
binmode( STDIN, ":utf8" );
binmode( STDOUT, ":utf8" );
+use warnings;
use strict;
use FindBin qw($RealBin);
use File::Basename;
diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl
index bbbccc8ef..c0b25f519 100755
--- a/scripts/generic/compound-splitter.perl
+++ b/scripts/generic/compound-splitter.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/generic/extract-factors.pl b/scripts/generic/extract-factors.pl
index 566849053..56c719051 100755
--- a/scripts/generic/extract-factors.pl
+++ b/scripts/generic/extract-factors.pl
@@ -6,6 +6,7 @@
#factor indices start at 0
#factor indices too large ought to be ignored
+use warnings;
use strict;
my ($filename, @factors) = @ARGV;
diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl
index 687a21e28..2b02fa869 100755
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@@ -3,6 +3,7 @@
# example
# ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput
+use warnings;
use strict;
use File::Basename;
@@ -32,8 +33,8 @@ my $glueFile;
my $phraseOrientation = 0;
my $phraseOrientationPriorsFile;
-my $GZIP_EXEC; # = which("pigz");
-if(-f "/usr/bin/pigz") {
+my $GZIP_EXEC;
+if(`which pigz`) {
$GZIP_EXEC = 'pigz';
}
else {
diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl
index 8cfdc0462..50bff1404 100755
--- a/scripts/generic/fsa2fsal.pl
+++ b/scripts/generic/fsa2fsal.pl
@@ -5,6 +5,7 @@
# Some rudimentary sanity checks are done on the fly.
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+use warnings;
use strict;
my $errs = 0;
diff --git a/scripts/generic/fsa2plf.pl b/scripts/generic/fsa2plf.pl
index 1177b01d5..4e7454a9f 100755
--- a/scripts/generic/fsa2plf.pl
+++ b/scripts/generic/fsa2plf.pl
@@ -8,6 +8,7 @@
# Note that the output format may not contain any spaces.
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+use warnings;
use strict;
use Getopt::Long;
diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl
index 26258587d..d1aa461ac 100755
--- a/scripts/generic/fsal2fsa.pl
+++ b/scripts/generic/fsal2fsa.pl
@@ -2,6 +2,7 @@
# A very simple script that converts fsal back to fsa format (openfst lattices)
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+use warnings;
use strict;
while (<>) {
diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl
index b7dca1bc9..653912c5c 100755
--- a/scripts/generic/generic-parallel.perl
+++ b/scripts/generic/generic-parallel.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl
index b5575e4d0..8793d3d8e 100755
--- a/scripts/generic/giza-parallel.perl
+++ b/scripts/generic/giza-parallel.perl
@@ -3,6 +3,7 @@
# example
# ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align
+use warnings;
use strict;
use File::Basename;
diff --git a/scripts/generic/lopar2pos.pl b/scripts/generic/lopar2pos.pl
index d95389c05..c75069135 100755
--- a/scripts/generic/lopar2pos.pl
+++ b/scripts/generic/lopar2pos.pl
@@ -4,6 +4,8 @@
#lopar2pos: extract POSs from LOPAR output
#usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos
+use warnings;
+
my $infilename = shift @ARGV;
open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n";
while(my $line = <INFILE>)
diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl
index 4890864aa..7c0f56c70 100755
--- a/scripts/generic/moses-parallel.pl
+++ b/scripts/generic/moses-parallel.pl
@@ -15,6 +15,7 @@
# added checks for existence of decoder and configuration file
# 26 Jul 2006 fix a bug related to the use of absolute path for srcfile and nbestfile
+use warnings;
use strict;
#######################
diff --git a/scripts/generic/mteval-v12.pl b/scripts/generic/mteval-v12.pl
index 0c771fc77..360376242 100755
--- a/scripts/generic/mteval-v12.pl
+++ b/scripts/generic/mteval-v12.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use utf8;
use Encode;
diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl
index 5ed6add74..2f44d419f 100755
--- a/scripts/generic/multi-bleu.perl
+++ b/scripts/generic/multi-bleu.perl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
my $lowercase = 0;
diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl
index b33cd2805..ea56927ac 100755
--- a/scripts/generic/ph_numbers.perl
+++ b/scripts/generic/ph_numbers.perl
@@ -7,6 +7,7 @@ package ph_numbers;
#
# (c) 2013 TAUS
+use warnings;
use strict;
run() unless caller();
diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl
index c5b63a71b..622323bdb 100755
--- a/scripts/generic/qsub-wrapper.pl
+++ b/scripts/generic/qsub-wrapper.pl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
#######################
diff --git a/scripts/generic/reverse-alignment.perl b/scripts/generic/reverse-alignment.perl
index fc8c33dff..d00140c74 100755
--- a/scripts/generic/reverse-alignment.perl
+++ b/scripts/generic/reverse-alignment.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $line;
diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl
index d6f16b2fc..9e5ee0025 100755
--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@@ -4,6 +4,7 @@
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1
+use warnings;
use strict;
use File::Basename;
@@ -13,8 +14,8 @@ sub GetSourcePhrase($);
sub NumStr($);
sub CutContextFile($$$);
-my $GZIP_EXEC; # = which("pigz");
-if(-f "/usr/bin/pigz") {
+my $GZIP_EXEC;
+if(`which pigz`) {
$GZIP_EXEC = 'pigz';
}
else {
diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl
index 61b823ce2..95513b608 100755
--- a/scripts/generic/strip-xml.perl
+++ b/scripts/generic/strip-xml.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
while (my $line = <STDIN>) {
diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl
index a84ea1c61..596143386 100755
--- a/scripts/generic/trainlm-irst2.perl
+++ b/scripts/generic/trainlm-irst2.perl
@@ -10,6 +10,7 @@
# irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
# Set smoothing method in settings, if different from modified Kneser-Ney
+use warnings;
use strict;
use FindBin qw($RealBin);
use Getopt::Long;
diff --git a/scripts/generic/trainlm-lmplz.perl b/scripts/generic/trainlm-lmplz.perl
deleted file mode 100755
index 045248675..000000000
--- a/scripts/generic/trainlm-lmplz.perl
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env perl
-
-# Compatible with sri LM-creating script, eg.
-# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
-# To use it in the EMS, add this to the [LM] section
-# lm-training = "$moses-script-dir/generic/trainlm-lmplz.perl -lmplz $lmplz"
-# settings = "-T $working-dir/tmp -S 10G"
-# Also, make sure that $lmplz is defined (in the [LM] or [GENERAL] section.
-# It should point to the binary file
-# lmplz = /home/waziz/workspace/github/moses/bin/lmplz
-
-use strict;
-use FindBin qw($RealBin);
-use Getopt::Long qw/GetOptionsFromArray/;
-#use Getopt::Long;
-Getopt::Long::Configure("pass_through", "no_ignore_case");
-
-my $order = 3; # order of language model (default trigram)
-my $corpus; # input text data
-my $lm; # generated language model
-my $lmplz; # bin directory of IRSTLM
-my $help = 0;
-
-my @optconfig = (
- "-order=s" => \$order,
- "-text=s" => \$corpus,
- "-lm=s" => \$lm,
- "-lmplz=s" => \$lmplz,
-);
-
-GetOptionsFromArray(\@ARGV, @optconfig);
-die("ERROR: please set text") unless defined($corpus);
-die("ERROR: please set lm") unless defined($lm);
-die("ERROR: please set lmplz") unless defined($lmplz);
-
-my $settings = join(' ', @ARGV);
-my $cmd = "$lmplz --order $order $settings < $corpus > $lm";
-
-print STDERR "EXECUTING $cmd\n";
-`$cmd`;
diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl
index 73ea51beb..130afd56b 100755
--- a/scripts/other/beautify.perl
+++ b/scripts/other/beautify.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use File::Basename;
use FindBin qw($RealBin);
diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl
index c0b723d64..08316c95b 100755
--- a/scripts/other/delete-scores.perl
+++ b/scripts/other/delete-scores.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/other/get_many_translations_from_google.perl b/scripts/other/get_many_translations_from_google.perl
index 6ef83e240..512b84e36 100755
--- a/scripts/other/get_many_translations_from_google.perl
+++ b/scripts/other/get_many_translations_from_google.perl
@@ -6,6 +6,7 @@
#
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+use warnings;
use strict;
use Getopt::Long;
use CGI;
diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl
index efa5e12b6..549cd8abe 100755
--- a/scripts/recaser/detruecase.perl
+++ b/scripts/recaser/detruecase.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl
index 0b1ded200..3ba83712a 100755
--- a/scripts/recaser/recase.perl
+++ b/scripts/recaser/recase.perl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl
index 27c5da198..87a720f6e 100755
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
use FindBin qw($Bin);
use Getopt::Long "GetOptions";
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index b6e5c3884..b653a8ca5 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -8,6 +8,7 @@
# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token.
#
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl
index d14d7ebe4..373aa509f 100755
--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@@ -1,6 +1,8 @@
#!/usr/bin/env perl
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/regression-testing/compare-results.pl b/scripts/regression-testing/compare-results.pl
index 0d77ef8fc..df14d444f 100755
--- a/scripts/regression-testing/compare-results.pl
+++ b/scripts/regression-testing/compare-results.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($results, $truth) = @ARGV;
diff --git a/scripts/regression-testing/create_localized_moses_ini.pl b/scripts/regression-testing/create_localized_moses_ini.pl
index 78a033b32..612a39e82 100755
--- a/scripts/regression-testing/create_localized_moses_ini.pl
+++ b/scripts/regression-testing/create_localized_moses_ini.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use MosesScriptsRegressionTesting;
diff --git a/scripts/regression-testing/modify-pars.pl b/scripts/regression-testing/modify-pars.pl
index 4669ae0b6..5ad2514a4 100755
--- a/scripts/regression-testing/modify-pars.pl
+++ b/scripts/regression-testing/modify-pars.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $argv=join(" ",@ARGV);
diff --git a/scripts/regression-testing/moses-virtual.pl b/scripts/regression-testing/moses-virtual.pl
index 55198900b..41ddd6b13 100755
--- a/scripts/regression-testing/moses-virtual.pl
+++ b/scripts/regression-testing/moses-virtual.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my %opt = ();
diff --git a/scripts/regression-testing/run-single-test.pl b/scripts/regression-testing/run-single-test.pl
index 2fa7b4dce..bb66e96f6 100755
--- a/scripts/regression-testing/run-single-test.pl
+++ b/scripts/regression-testing/run-single-test.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use MosesScriptsRegressionTesting;
diff --git a/scripts/regression-testing/run-test-suite.pl b/scripts/regression-testing/run-test-suite.pl
index d90dfa35d..8ae9ec60f 100755
--- a/scripts/regression-testing/run-test-suite.pl
+++ b/scripts/regression-testing/run-test-suite.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use Getopt::Long;
diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl
index 17fe650d2..0e73a7718 100755
--- a/scripts/tokenizer/deescape-special-chars-PTB.perl
+++ b/scripts/tokenizer/deescape-special-chars-PTB.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl
index dc810d817..076d1e62f 100755
--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index 14d6666c9..7874d5d04 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -7,6 +7,8 @@
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
+
+use warnings;
use strict;
use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
@@ -36,7 +38,7 @@ if ($HELP) {
exit;
}
-if ($language !~ /^(cs|en|fr|it)$/) {
+if ($language !~ /^(cs|en|fr|it|fi)$/) {
print STDERR "Warning: No built-in rules for language $language.\n"
}
@@ -176,6 +178,11 @@ sub detokenize {
}
+ } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
+ # Finnish : without intervening space if followed by case suffix
+ # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
+ $text=$text. lc $words[$i];
+ $prependSpace = " ";
} else {
$text=$text.$prependSpace.$words[$i];
$prependSpace = " ";
diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl
index 79ae39469..e94b91744 100755
--- a/scripts/tokenizer/escape-special-chars.perl
+++ b/scripts/tokenizer/escape-special-chars.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl
index cb1250938..9ee307bc2 100755
--- a/scripts/tokenizer/lowercase.perl
+++ b/scripts/tokenizer/lowercase.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
binmode(STDIN, ":utf8");
diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl
index 8f779449f..db8f9c60e 100755
--- a/scripts/tokenizer/normalize-punctuation.perl
+++ b/scripts/tokenizer/normalize-punctuation.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $language = "en";
diff --git a/scripts/tokenizer/pre-tok-clean.perl b/scripts/tokenizer/pre-tok-clean.perl
new file mode 100755
index 000000000..900e992ee
--- /dev/null
+++ b/scripts/tokenizer/pre-tok-clean.perl
@@ -0,0 +1,51 @@
+#!/usr/bin/env perl
+
+# Filter a parallel corpus by line length in characters.
+# Reads <input-stem>.<source> and <input-stem>.<target> in lockstep and writes
+# a sentence pair to <output-stem>.<source> / <output-stem>.<target> only when
+# both sides are between min-chars and max-chars long (inclusive). The 1-based
+# input line number of every retained pair is written to <lines-retained>.
+
+use warnings;
+use strict;
+
+die "usage: $0 min-chars max-chars input-stem source target output-stem lines-retained\n"
+  unless @ARGV >= 7;
+
+my ($minChars, $maxChars, $inputStem, $source, $target, $outputStem, $linesRetained) = @ARGV;
+
+open(IN_SOURCE, "<:encoding(UTF-8)", "$inputStem.$source") or die "cannot open $inputStem.$source";
+open(IN_TARGET, "<:encoding(UTF-8)", "$inputStem.$target") or die "cannot open $inputStem.$target";
+
+open(OUT_SOURCE, ">:encoding(UTF-8)", "$outputStem.$source") or die "cannot open $outputStem.$source";
+open(OUT_TARGET, ">:encoding(UTF-8)", "$outputStem.$target") or die "cannot open $outputStem.$target";
+
+open(LINE_RETAINED, ">:encoding(UTF-8)", "$linesRetained") or die "cannot open $linesRetained";
+
+my $lineNum = 0;
+while (my $lineSource = <IN_SOURCE>) {
+  ++$lineNum;
+
+  chomp($lineSource);
+  my $lineTarget = <IN_TARGET>;
+  # The target file may be shorter than the source file; treat a missing
+  # target line as empty so it is length-filtered like any other line.
+  $lineTarget = "" unless defined($lineTarget);
+  chomp($lineTarget);
+
+  my $lenSource = length($lineSource);
+  my $lenTarget = length($lineTarget);
+
+  next if $lenSource < $minChars || $lenSource > $maxChars
+       || $lenTarget < $minChars || $lenTarget > $maxChars;
+
+  print OUT_SOURCE "$lineSource\n";
+  print OUT_TARGET "$lineTarget\n";
+  print LINE_RETAINED "$lineNum\n";
+}
+
+close(IN_SOURCE);
+close(IN_TARGET);
+close(OUT_SOURCE) or die "cannot close $outputStem.$source";
+close(OUT_TARGET) or die "cannot close $outputStem.$target";
+close(LINE_RETAINED) or die "cannot close $linesRetained";
diff --git a/scripts/tokenizer/pre-tokenizer.perl b/scripts/tokenizer/pre-tokenizer.perl
index 35134a9c0..499671b44 100755
--- a/scripts/tokenizer/pre-tokenizer.perl
+++ b/scripts/tokenizer/pre-tokenizer.perl
@@ -4,6 +4,7 @@
# Start by Ulrich Germann, after noticing systematic preprocessing errors
# in some of the English Europarl data.
+use warnings;
use strict;
use Getopt::Std;
diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl
index 4dadd1d77..2b90dfd3b 100755
--- a/scripts/tokenizer/remove-non-printing-char.perl
+++ b/scripts/tokenizer/remove-non-printing-char.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use utf8;
binmode(STDIN, ":utf8");
diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl
index 748e1d063..08eb766bf 100755
--- a/scripts/tokenizer/replace-unicode-punctuation.perl
+++ b/scripts/tokenizer/replace-unicode-punctuation.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
#binmode(STDIN, ":utf8");
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index eeede0af0..8abffbea4 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -16,6 +16,7 @@ use warnings;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
+use warnings;
use FindBin qw($RealBin);
use strict;
use Time::HiRes;
diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl
index 6417b7d6e..bce7a38a0 100755
--- a/scripts/tokenizer/tokenizer_PTB.perl
+++ b/scripts/tokenizer/tokenizer_PTB.perl
@@ -14,6 +14,7 @@
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
+use warnings;
use FindBin qw($RealBin);
use strict;
use Time::HiRes;
diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl
index ecfcb3395..5c9c0970a 100755
--- a/scripts/training/absolutize_moses_model.pl
+++ b/scripts/training/absolutize_moses_model.pl
@@ -6,6 +6,8 @@
#
# Ondrej Bojar.
+use warnings;
+
my $ini = shift;
die "usage: absolutize_moses_model.pl path-to-moses.ini > moses.abs.ini"
if !defined $ini;
diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py
index 66f8f0413..cd8755580 100755
--- a/scripts/training/bilingual-lm/extract_training.py
+++ b/scripts/training/bilingual-lm/extract_training.py
@@ -147,7 +147,7 @@ def main():
#Numberize the file
for line in ngrams_file_handle:
- numberized_file_handle.write(extract.numberize(line, m, n, tvocab_idmap, tvocab_idmap))
+ numberized_file_handle.write(extract.numberize(line, options.m, options.n, svocab_idmap, tvocab_idmap))
numberized_file_handle.close()
ngrams_file_handle.close()
diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl
index 0246190f2..3d4798ffd 100755
--- a/scripts/training/binarize-model.perl
+++ b/scripts/training/binarize-model.perl
@@ -4,6 +4,7 @@
# Binarize a Moses model
#
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/training/build-generation-table.perl b/scripts/training/build-generation-table.perl
index 8b1303795..fb59f4acc 100755
--- a/scripts/training/build-generation-table.perl
+++ b/scripts/training/build-generation-table.perl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
# $Id$
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl
index bd8d1078f..a7ddaff70 100755
--- a/scripts/training/build-mmsapt.perl
+++ b/scripts/training/build-mmsapt.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index 40e4d8935..e1e96528c 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
+use warnings;
use strict;
use Getopt::Long;
my $help;
diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl
index 93e37b803..5e9dff72a 100755
--- a/scripts/training/clone_moses_model.pl
+++ b/scripts/training/clone_moses_model.pl
@@ -5,6 +5,7 @@
# in the current directory
# All relevant files are hardlinked or copied to the directory, too.
+use warnings;
use strict;
use Getopt::Long;
diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl
index 1bc4fe79d..25c562ef4 100755
--- a/scripts/training/convert-moses-ini-to-v2.perl
+++ b/scripts/training/convert-moses-ini-to-v2.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $header = "";
diff --git a/scripts/training/corpus-sizes.perl b/scripts/training/corpus-sizes.perl
index 1eccf9bd5..02dd4ae9b 100755
--- a/scripts/training/corpus-sizes.perl
+++ b/scripts/training/corpus-sizes.perl
@@ -2,6 +2,7 @@
# $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $
+use warnings;
use strict;
my ($in,$out,@PART) = @ARGV;
diff --git a/scripts/training/exodus.perl b/scripts/training/exodus.perl
index ef3d8df92..d3466f5dd 100755
--- a/scripts/training/exodus.perl
+++ b/scripts/training/exodus.perl
@@ -2,6 +2,7 @@
# $Id$
+use warnings;
use strict;
my @LINE = <STDIN>;
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index dbafc73be..7dec0762c 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -8,6 +8,7 @@
# changes by Ondrej Bojar
# adapted for hierarchical models by Phil Williams
+use warnings;
use strict;
use FindBin qw($RealBin);
diff --git a/scripts/training/get-lexical.perl b/scripts/training/get-lexical.perl
index 2dcf7437f..45fe6d54c 100755
--- a/scripts/training/get-lexical.perl
+++ b/scripts/training/get-lexical.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use FindBin qw($RealBin);
BEGIN { require "$RealBin/LexicalTranslationModel.pm"; "LexicalTranslationModel"->import; }
diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl
index 8b2150e31..56fc9a466 100755
--- a/scripts/training/giza2bal.pl
+++ b/scripts/training/giza2bal.pl
@@ -7,6 +7,8 @@
#Copyright Marcello Federico, November 2004
+#use warnings;
+
($cnt,$dir,$inv)=();
while ($w=shift @ARGV){
@@ -17,7 +19,7 @@ while ($w=shift @ARGV){
my $lc = 0;
-if (!$dir || !inv){
+if (!$dir || !$inv){
print "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n";
print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
exit(0);
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 5d1f9b368..86084abbf 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -47,6 +47,7 @@
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn
+use warnings;
use strict;
use FindBin qw($RealBin);
use File::Basename;
diff --git a/scripts/training/postprocess-lopar.perl b/scripts/training/postprocess-lopar.perl
index 9962d5594..5171e02fb 100755
--- a/scripts/training/postprocess-lopar.perl
+++ b/scripts/training/postprocess-lopar.perl
@@ -2,6 +2,7 @@
# $Id$
+use warnings;
use strict;
use utf8;
diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl
index bc08a3a9d..c265652f6 100755
--- a/scripts/training/reduce-factors.perl
+++ b/scripts/training/reduce-factors.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl
index 2f412cd28..bd5d7f1d2 100755
--- a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl
+++ b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($ttable_file) = @ARGV;
diff --git a/scripts/training/strip-xml.perl b/scripts/training/strip-xml.perl
new file mode 100755
index 000000000..0f403d15d
--- /dev/null
+++ b/scripts/training/strip-xml.perl
@@ -0,0 +1,21 @@
+#!/usr/bin/env perl
+
+# strip text file of any XML markup
+# Reads STDIN and writes STDOUT (both UTF-8): every <...> tag whose first
+# character after '<' is non-blank is replaced by a space, runs of spaces are
+# collapsed to one, and a leading space is removed from each line.
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use warnings;
+use strict;
+
+while(<STDIN>) {
+  s/<\S[^>]*>/ /g;
+  chomp;
+  s/ +/ /g;
+  s/^ //;
+  print $_;
+  print "\n";
+}
diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl
index f8d15a8ae..a23fb8b5c 100755
--- a/scripts/training/threshold-filter.perl
+++ b/scripts/training/threshold-filter.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my %MIN_SCORE;
diff --git a/scripts/training/train-global-lexicon-model.perl b/scripts/training/train-global-lexicon-model.perl
index 20ee42b72..0e7d3077d 100755
--- a/scripts/training/train-global-lexicon-model.perl
+++ b/scripts/training/train-global-lexicon-model.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use Switch;
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index ade5c5277..4c355479c 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -1,5 +1,6 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
@@ -404,8 +405,8 @@ else {
$SORT_EXEC = 'sort';
}
-my $GZIP_EXEC; # = which("pigz");
-if(-f "/usr/bin/pigz") {
+my $GZIP_EXEC;
+if(`which pigz`) {
$GZIP_EXEC = 'pigz';
}
else {
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
index 3bbf982b7..3dd8fc4ac 100755
--- a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
index 91fc515cb..e61a53652 100755
--- a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
while(<STDIN>) {
diff --git a/scripts/training/wrappers/filter-excluded-lines.perl b/scripts/training/wrappers/filter-excluded-lines.perl
index 2f1e25ad4..7f9da3efa 100755
--- a/scripts/training/wrappers/filter-excluded-lines.perl
+++ b/scripts/training/wrappers/filter-excluded-lines.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long;
diff --git a/scripts/training/wrappers/find-unparseable.perl b/scripts/training/wrappers/find-unparseable.perl
index 0bbf35df4..b0d38027b 100755
--- a/scripts/training/wrappers/find-unparseable.perl
+++ b/scripts/training/wrappers/find-unparseable.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my $lineNum = 1;
diff --git a/scripts/training/wrappers/mada-wrapper.perl b/scripts/training/wrappers/mada-wrapper.perl
index eec10a3ef..20f76f821 100755
--- a/scripts/training/wrappers/mada-wrapper.perl
+++ b/scripts/training/wrappers/mada-wrapper.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use File::Temp qw/tempfile/;
use Getopt::Long "GetOptions";
diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl
new file mode 100755
index 000000000..6e7efe245
--- /dev/null
+++ b/scripts/training/wrappers/madamira-wrapper.perl
@@ -0,0 +1,110 @@
+#!/usr/bin/env perl
+
+# Wrapper around MADAMIRA: copies STDIN to a temporary file, splits it into
+# 10000-line chunks, runs MADAMIRA.jar on the chunks through `parallel`, and
+# prints the text of the ";;WORD" lines of the concatenated .mada output to
+# STDOUT, starting a new output line at every "SENTENCE BREAK" line.
+
+use warnings;
+use strict;
+use File::Temp qw/tempfile/;
+use Getopt::Long "GetOptions";
+use File::Basename;
+use FindBin qw($RealBin);
+use Cwd 'abs_path';
+
+my $TMPDIR = "tmp";
+my $SCHEME = "D2";
+my $KEEP_TMP = 0;
+my $MADA_DIR;
+
+GetOptions(
+  "scheme=s" => \$SCHEME,
+  "tmpdir=s" => \$TMPDIR,
+  "keep-tmp" => \$KEEP_TMP,
+  "mada-dir=s" => \$MADA_DIR
+  ) or die("ERROR: unknown options");
+
+# The MADAMIRA command below is built from $MADA_DIR, so fail early
+# instead of running "cd  && parallel ..." with an undefined directory.
+die("ERROR: please set --mada-dir") unless defined($MADA_DIR);
+
+$TMPDIR = abs_path($TMPDIR);
+print STDERR "TMPDIR=$TMPDIR \n";
+
+#binmode(STDIN, ":utf8");
+#binmode(STDOUT, ":utf8");
+
+# per-process working directory, named after the PID
+$TMPDIR = "$TMPDIR/madamira.$$";
+`mkdir -p $TMPDIR`;
+`mkdir -p $TMPDIR/split`;
+`mkdir -p $TMPDIR/out`;
+
+my $infile = "$TMPDIR/input";
+print STDERR $infile."\n";
+
+# copy STDIN to a file so that it can be split
+open(TMP,">$infile") or die("ERROR: cannot open $infile for writing: $!");
+while(<STDIN>) {
+  print TMP $_;
+}
+close(TMP) or die("ERROR: cannot close $infile: $!");
+
+my $cmd;
+
+# split input file
+# gsplit is used when `gsplit --help` prints anything, plain split otherwise
+my $SPLIT_EXEC = `gsplit --help 2>/dev/null`;
+if($SPLIT_EXEC) {
+  $SPLIT_EXEC = 'gsplit';
+}
+else {
+  $SPLIT_EXEC = 'split';
+}
+
+$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x";
+`$cmd`;
+
+$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*";
+print STDERR "Executing: $cmd\n";
+`$cmd`;
+
+$cmd = "cat $TMPDIR/out/x*.mada > $infile.mada";
+print STDERR "Executing: $cmd\n";
+`$cmd`;
+
+# get stuff out of mada output
+open(MADA_OUT,"<$infile.mada") or die("ERROR: cannot open $infile.mada: $!");
+#binmode(MADA_OUT, ":utf8");
+while(my $line = <MADA_OUT>) {
+  # NOTE(review): chop removes the last character unconditionally (chomp
+  # would only remove a newline), and the substr below additionally leaves
+  # out the final character of the line -- confirm against the .mada format.
+  chop($line);
+  #print STDERR "line=$line \n";
+
+  if (index($line, "SENTENCE BREAK") == 0) {
+    # new sentence
+    #print STDERR "BREAK\n";
+    print "\n";
+  }
+  elsif (index($line, ";;WORD") == 0) {
+    # word
+    my $word = substr($line, 7, length($line) - 8);
+    #print STDERR "FOund $word\n";
+    print "$word ";
+  }
+  else {
+    #print STDERR "NADA\n";
+  }
+}
+close (MADA_OUT);
+
+
+# NOTE(review): the cleanup below is commented out, so the temporary
+# directory is currently kept whether or not --keep-tmp is given.
+if ($KEEP_TMP == 0) {
+#  `rm -rf $TMPDIR`;
+}
+
diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
index cf7473e44..88d16b3f6 100755
--- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
+++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV;
diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl
index 4b2c90495..1cc917bce 100755
--- a/scripts/training/wrappers/make-factor-de-morph.perl
+++ b/scripts/training/wrappers/make-factor-de-morph.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Encode;
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl
index 8cc28695a..2eadd4123 100755
--- a/scripts/training/wrappers/make-factor-de-pos.perl
+++ b/scripts/training/wrappers/make-factor-de-pos.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($in,$out,$tmpdir) = @ARGV;
diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
index 3ab2b1ca4..0d27aa12f 100755
--- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
+++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl
index 1e00a8fa3..2af6eb75c 100755
--- a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl
+++ b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
# handle switches
diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl
index c222ad0df..60aca0b34 100755
--- a/scripts/training/wrappers/make-factor-stem.perl
+++ b/scripts/training/wrappers/make-factor-stem.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($size,$in,$out) = @ARGV;
diff --git a/scripts/training/wrappers/make-factor-suffix.perl b/scripts/training/wrappers/make-factor-suffix.perl
index d13c43230..7e864ea0c 100755
--- a/scripts/training/wrappers/make-factor-suffix.perl
+++ b/scripts/training/wrappers/make-factor-suffix.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
my ($size,$in,$out) = @ARGV;
diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
index f7855e06d..fc1f0c532 100755
--- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
+++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl
index b8b546953..68df07c49 100755
--- a/scripts/training/wrappers/parse-de-berkeley.perl
+++ b/scripts/training/wrappers/parse-de-berkeley.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl
index 8cb34055c..4723d6aa0 100755
--- a/scripts/training/wrappers/parse-de-bitpar.perl
+++ b/scripts/training/wrappers/parse-de-bitpar.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/parse-en-collins.perl b/scripts/training/wrappers/parse-en-collins.perl
index 3d879c06b..27b33a2dd 100755
--- a/scripts/training/wrappers/parse-en-collins.perl
+++ b/scripts/training/wrappers/parse-en-collins.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use File::Basename;
use File::Temp qw/tempfile/;
diff --git a/scripts/training/wrappers/parse-en-egret.perl b/scripts/training/wrappers/parse-en-egret.perl
index fc330c70f..c3d23a4ee 100755
--- a/scripts/training/wrappers/parse-en-egret.perl
+++ b/scripts/training/wrappers/parse-en-egret.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl
index 2c830f6b6..1bb616939 100755
--- a/scripts/training/wrappers/syntax-hyphen-splitting.perl
+++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl
index b6b2871ba..4f26efabe 100755
--- a/scripts/training/wrappers/tagger-german-chunk.perl
+++ b/scripts/training/wrappers/tagger-german-chunk.perl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
+use warnings;
use strict;
use Getopt::Long "GetOptions";
diff --git a/symal/symal.cpp b/symal/symal.cpp
index dbe68f1b9..249aa6caa 100644
--- a/symal/symal.cpp
+++ b/symal/symal.cpp
@@ -67,7 +67,7 @@ int verbose=0;
int lc = 0;
-int getals(fstream& inp,int& m, int *a,int& n, int *b)
+int getals(istream& inp,int& m, int *a,int& n, int *b)
{
char w[MAX_WORD], dummy[10];
int i,j,freq;
@@ -121,7 +121,7 @@ int getals(fstream& inp,int& m, int *a,int& n, int *b)
//compute union alignment
-int prunionalignment(fstream& out,int m,int *a,int n,int* b)
+int prunionalignment(ostream& out,int m,int *a,int n,int* b)
{
ostringstream sout;
@@ -150,7 +150,7 @@ int prunionalignment(fstream& out,int m,int *a,int n,int* b)
//Compute intersection alignment
-int printersect(fstream& out,int m,int *a,int n,int* b)
+int printersect(ostream& out,int m,int *a,int n,int* b)
{
ostringstream sout;
@@ -174,7 +174,7 @@ int printersect(fstream& out,int m,int *a,int n,int* b)
//Compute target-to-source alignment
-int printtgttosrc(fstream& out,int m,int *a,int n,int* b)
+int printtgttosrc(ostream& out,int m,int *a,int n,int* b)
{
ostringstream sout;
@@ -198,7 +198,7 @@ int printtgttosrc(fstream& out,int m,int *a,int n,int* b)
//Compute source-to-target alignment
-int printsrctotgt(fstream& out,int m,int *a,int n,int* b)
+int printsrctotgt(ostream& out,int m,int *a,int n,int* b)
{
ostringstream sout;
@@ -226,7 +226,7 @@ int printsrctotgt(fstream& out,int m,int *a,int n,int* b)
//to represent the grow alignment as the unionalignment of a
//directed and inverted alignment
-int printgrow(fstream& out,int m,int *a,int n,int* b, bool diagonal=false,bool final=false,bool bothuncovered=false)
+int printgrow(ostream& out,int m,int *a,int n,int* b, bool diagonal=false,bool final=false,bool bothuncovered=false)
{
ostringstream sout;
@@ -392,8 +392,8 @@ int main(int argc, char** argv)
{
int alignment=0;
- char* input=(char*)"/dev/stdin";
- char* output=(char*)"/dev/stdout";
+ char* input= NULL;
+ char* output= NULL;
int diagonal=false;
int final=false;
int bothuncovered=false;
@@ -421,23 +421,29 @@ int main(int argc, char** argv)
<< "Input file or std must be in .bal format (see script giza2bal.pl).\n";
exit(1);
-
}
- fstream inp(input,ios::in);
- fstream out(output,ios::out);
+ istream *inp = &std::cin;
+ ostream *out = &std::cout;
- if (!inp.is_open()) {
- cerr << "cannot open " << input << "\n";
- exit(1);
+ if (input) {
+ fstream *fin = new fstream(input,ios::in);
+ if (!fin->is_open()) {
+ cerr << "cannot open " << input << "\n";
+ exit(1);
+ }
+ inp = fin;
}
- if (!out.is_open()) {
- cerr << "cannot open " << output << "\n";
- exit(1);
+ if (output) {
+ fstream *fout = new fstream(output,ios::out);
+ if (!fout->is_open()) {
+ cerr << "cannot open " << output << "\n";
+ exit(1);
+ }
+ out = fout;
}
-
int a[MAX_M],b[MAX_N],m,n;
fa=new int[MAX_M+1];
ea=new int[MAX_N+1];
@@ -450,16 +456,16 @@ int main(int argc, char** argv)
switch (alignment) {
case UNION:
cerr << "symal: computing union alignment\n";
- while(getals(inp,m,a,n,b)) {
- prunionalignment(out,m,a,n,b);
+ while(getals(*inp,m,a,n,b)) {
+ prunionalignment(*out,m,a,n,b);
sents++;
}
cerr << "Sents: " << sents << endl;
break;
case INTERSECT:
cerr << "symal: computing intersect alignment\n";
- while(getals(inp,m,a,n,b)) {
- printersect(out,m,a,n,b);
+ while(getals(*inp,m,a,n,b)) {
+ printersect(*out,m,a,n,b);
sents++;
}
cerr << "Sents: " << sents << endl;
@@ -469,15 +475,15 @@ int main(int argc, char** argv)
<< diagonal << ") final ("<< final << ")"
<< "both-uncovered (" << bothuncovered <<")\n";
- while(getals(inp,m,a,n,b))
- printgrow(out,m,a,n,b,diagonal,final,bothuncovered);
+ while(getals(*inp,m,a,n,b))
+ printgrow(*out,m,a,n,b,diagonal,final,bothuncovered);
break;
case TGTTOSRC:
cerr << "symal: computing target-to-source alignment\n";
- while(getals(inp,m,a,n,b)) {
- printtgttosrc(out,m,a,n,b);
+ while(getals(*inp,m,a,n,b)) {
+ printtgttosrc(*out,m,a,n,b);
sents++;
}
cerr << "Sents: " << sents << endl;
@@ -485,8 +491,8 @@ int main(int argc, char** argv)
case SRCTOTGT:
cerr << "symal: computing source-to-target alignment\n";
- while(getals(inp,m,a,n,b)) {
- printsrctotgt(out,m,a,n,b);
+ while(getals(*inp,m,a,n,b)) {
+ printsrctotgt(*out,m,a,n,b);
sents++;
}
cerr << "Sents: " << sents << endl;
@@ -500,5 +506,12 @@ int main(int argc, char** argv)
for (int i=1; i<=MAX_N; i++) delete [] A[i];
delete [] A;
+  if (inp != &std::cin) {
+    delete inp;  // only a stream opened above with new is owned here
+  }
+  if (out != &std::cout) {
+    delete out;  // was "delete inp": freed inp twice and never closed/flushed the output file
+  }
+
exit(0);
}
diff --git a/util/Jamfile b/util/Jamfile
index 18b20a33a..2d3cede01 100644
--- a/util/Jamfile
+++ b/util/Jamfile
@@ -21,7 +21,7 @@ obj file_piece_test.o : file_piece_test.cc /top//boost_unit_test_framework : $(c
fakelib parallel_read : parallel_read.cc : <threading>multi:<source>/top//boost_thread <threading>multi:<define>WITH_THREADS : : <include>.. ;
-fakelib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc parallel_read pool.cc read_compressed scoped.cc string_piece.cc usage.cc double-conversion//double-conversion : <include>.. <os>LINUX,<threading>single:<source>rt : : <include>.. ;
+fakelib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc parallel_read pool.cc random.cc read_compressed scoped.cc string_piece.cc usage.cc double-conversion//double-conversion : <include>.. <os>LINUX,<threading>single:<source>rt : : <include>.. ;
exe cat_compressed : cat_compressed_main.cc kenutil ;
@@ -32,5 +32,5 @@ import testing ;
run file_piece_test.o kenutil /top//boost_unit_test_framework : : file_piece.cc ;
for local t in [ glob *_test.cc : file_piece_test.cc read_compressed_test.cc ] {
local name = [ MATCH "(.*)\.cc" : $(t) ] ;
- unit-test $(name) : $(t) kenutil /top//boost_unit_test_framework /top//boost_system ;
+ unit-test $(name) : $(t) kenutil /top//boost_unit_test_framework /top//boost_filesystem /top//boost_system ;
}
diff --git a/util/mmap.hh b/util/mmap.hh
index 9b1e120f3..37feb5bee 100644
--- a/util/mmap.hh
+++ b/util/mmap.hh
@@ -100,9 +100,12 @@ typedef enum {
extern const int kFileFlags;
-// Wrapper around mmap to check it worked and hide some platform macros.
+// Cross-platform, error-checking wrapper for mmap().
void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
+// Cross-platform, error-checking wrapper for munmap().
+void UnmapOrThrow(void *start, std::size_t length);
+
void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
void MapAnonymous(std::size_t size, scoped_memory &to);
diff --git a/util/random.cc b/util/random.cc
new file mode 100644
index 000000000..4db1a61ee
--- /dev/null
+++ b/util/random.cc
@@ -0,0 +1,44 @@
+#include "util/random.hh"
+
+#include <cstdlib>
+#include <ctime>
+
+#include <boost/thread/locks.hpp>
+#include <boost/thread/lock_guard.hpp>
+#include <boost/thread/mutex.hpp>
+
+namespace util
+{
+namespace
+{
+/** Lock to protect randomizer.
+ *
+ * This module is implemented in terms of rand()/srand() from <cstdlib>.
+ * These functions are standard C, but they're not thread-safe. Scalability
+ * is not worth much complexity here, so just slap a mutex around it.
+ */
+boost::mutex rand_lock;
+} // namespace
+
+void rand_init(unsigned int seed)
+{
+  boost::lock_guard<boost::mutex> lock(rand_lock);  // same lock as rand_int(): seeding must not race with drawing
+  std::srand(seed);  // qualified: <cstdlib> guarantees std::srand, and rand_int() already uses std::rand
+}
+
+
+void rand_init()
+{
+  rand_init(static_cast<unsigned int>(std::time(NULL)));  // seed from the clock; time_t narrowed explicitly
+}
+
+namespace internal
+{
+// This is the one call to the actual randomizer. All else is built on this.
+int rand_int()
+{
+ boost::lock_guard<boost::mutex> lock(rand_lock);
+ return std::rand();
+}
+} // namespace internal
+} // namespace util
diff --git a/util/random.hh b/util/random.hh
new file mode 100644
index 000000000..6c2773520
--- /dev/null
+++ b/util/random.hh
@@ -0,0 +1,229 @@
+#ifndef UTIL_RANDOM_H
+#define UTIL_RANDOM_H
+
+#include <cstdlib>
+#include <limits>
+
+namespace util
+{
+/** Thread-safe, cross-platform random number generator.
+ *
+ * This is not for proper security-grade randomness, but should be "good
+ * enough" for producing arbitrary values of various numeric types.
+ *
+ * Before starting, call rand_init() to seed the randomizer. There is no need
+ * to do this more than once; in fact doing it more often is likely to make the
+ * randomizer less effective. Once that is done, call the rand(), rand_excl(),
+ * and rand_incl() functions as needed to generate pseudo-random numbers.
+ *
+ * Probability distribution is roughly uniform, but for integral types is
+ * skewed slightly towards lower numbers depending on how close "top" comes to
+ * RAND_MAX.
+ *
+ * For floating-point types, resolution is limited; there will actually be
+ * only RAND_MAX different possible values.
+ */
+
+/** Initialize randomizer with a fixed seed.
+ *
+ * After this, unless the randomizer gets seeded again, consecutive calls to
+ * the random functions will return a sequence of pseudo-random numbers
+ * determined by the seed. Every time the randomizer is seeded with this same
+ * seed, it will again start returning the same sequence of numbers.
+ */
+void rand_init(unsigned int);
+
+/** Initialize randomizer based on current time.
+ *
+ * Call this to make the randomizer return hard-to-predict numbers. It won't
+ * produce high-grade randomness, but enough to make the program act
+ * differently on different runs.
+ *
+ * The seed will be based on the current time in seconds. So calling it twice
+ * within the same second will just reset the randomizer to where it was before.
+ * Don't do that.
+ */
+void rand_init();
+
+
+/** Return a pseudorandom number between 0 and RAND_MAX inclusive.
+ *
+ * Initialize (seed) the randomizer before starting to call this.
+ */
+template<typename T> inline T rand();
+
+
+/** Return a pseudorandom number in the half-open interval [bottom, top).
+ *
+ * Generates a value between "bottom" (inclusive) and "top" (exclusive),
+ * assuming that (top - bottom) <= RAND_MAX.
+ */
+template<typename T> inline T rand_excl(T bottom, T top);
+
+
+/** Return a pseudorandom number in the half-open interval [0, top).
+ *
+ * Generates a value between 0 (inclusive) and "top" (exclusive), assuming that
+ * top <= RAND_MAX.
+ */
+template<typename T> inline T rand_excl(T top);
+
+
+/** Return a pseudorandom number in the closed interval [bottom, top].
+ *
+ * Generates a value between "bottom" and "top" inclusive, assuming that
+ * (top - bottom) < RAND_MAX.
+ */
+template<typename T> inline T rand_incl(T bottom, T top);
+
+
+/** Return a pseudorandom number in the closed interval [0, top].
+ *
+ * Generates a value between 0 and "top" inclusive, assuming that
+ * top < RAND_MAX.
+ */
+template<typename T> inline T rand_incl(T top);
+
+
+/** Return a pseudorandom number which may be larger than RAND_MAX.
+ *
+ * The requested type must be integral, and its size must be an even multiple
+ * of the size of an int. The return value will combine one or more random
+ * ints into a single value, which could get quite large.
+ *
+ * The result is nonnegative. Because the constituent ints are also
+ * nonnegative, the most significant bit in each of the ints will be zero,
+ * so for a wider type, there will be "gaps" in the range of possible outputs.
+ */
+template<typename T> inline T wide_rand();
+
+/** Return a pseudorandom number in [0, top), not limited to RAND_MAX.
+ *
+ * Works like wide_rand(), but if the requested type is wider than an int, it
+ * accommodates larger top values than an int can represent.
+ */
+template<typename T> inline T wide_rand_excl(T top);
+
+/** Return a pseudorandom number in [bottom, top), not limited to RAND_MAX.
+ *
+ * Works like wide_rand(), but if the requested type is wider than an int, it
+ * accommodates larger value ranges than an int can represent.
+ */
+template<typename T> inline T wide_rand_excl(T bottom, T top);
+
+/** Return a pseudorandom number in [0, top], not limited to RAND_MAX.
+ *
+ * Works like wide_rand(), but if the requested type is wider than an int, it
+ * accommodates larger top values than an int can represent.
+ */
+template<typename T> inline T wide_rand_incl(T top);
+
+/** Return a pseudorandom number in [bottom, top], not limited to RAND_MAX.
+ *
+ * Works like wide_rand(), but if the requested type is wider than an int, it
+ * accommodates larger top values than an int can represent.
+ */
+template<typename T> inline T wide_rand_incl(T bottom, T top);
+
+
+/// Implementation detail. For the random module's internal use only.
+namespace internal
+{
+/// The central call to the randomizer upon which this whole module is built.
+int rand_int();
+
+/// Helper template: customize random values to required ranges.
+template<typename T, bool is_integer_type> struct random_scaler;
+
+/// Specialized random_scaler for integral types.
+template<typename T> struct random_scaler<T, true>
+{
+ static T rnd_excl(T value, T range) { return value % range; }
+ static T rnd_incl(T value, T range) { return value % (range + 1); }
+};
+
+/// Specialized random_scaler for non-integral types.
+template<typename T> struct random_scaler<T, false>
+{
+ static T rnd_excl(T value, T range)
+ {
+ // Promote RAND_MAX to T before adding one to avoid overflow.
+ return range * value / (T(RAND_MAX) + 1);
+ }
+ static T rnd_incl(T value, T range) { return range * value / RAND_MAX; }
+};
+
+/// Helper for filling a wider variable with random ints, highest-order int first.
+template<typename T, size_t remaining_ints> struct wide_random_collector
+{
+  static T generate()
+  {
+    const T high_part = util::rand<T>() << (8 * sizeof(int) * (remaining_ints - 1));  // one slot per int; a fixed one-int shift overlaps when T holds 3+ ints
+    return high_part | wide_random_collector<T, remaining_ints-1>::generate();
+  }
+};
+/// Specialized wide_random_collector for generating just a single int.
+template<typename T> struct wide_random_collector<T, 1>
+{
+ static T generate() { return util::rand<T>(); }
+};
+
+} // namespace internal
+
+
+template<typename T> inline T rand()
+{
+ return T(util::internal::rand_int());
+}
+
+template<typename T> inline T rand_excl(T top)
+{
+ typedef internal::random_scaler<T, std::numeric_limits<T>::is_integer> scaler;
+ return scaler::rnd_excl(util::rand<T>(), top);
+}
+
+template<typename T> inline T rand_excl(T bottom, T top)
+{
+ return bottom + rand_excl(top - bottom);
+}
+
+template<typename T> inline T rand_incl(T top)
+{
+ typedef internal::random_scaler<T, std::numeric_limits<T>::is_integer> scaler;
+ return scaler::rnd_incl(util::rand<T>(), top);
+}
+
+template<typename T> inline T rand_incl(T bottom, T top)
+{
+ return bottom + rand_incl(top - bottom);
+}
+
+template<typename T> inline T wide_rand()
+{
+ return internal::wide_random_collector<T, sizeof(T)/sizeof(int)>::generate();
+}
+
+template<typename T> inline T wide_rand_excl(T top)
+{
+ typedef internal::random_scaler<T, std::numeric_limits<T>::is_integer> scaler;
+ return scaler::rnd_excl(util::wide_rand<T>(), top);
+}
+
+template<typename T> inline T wide_rand_excl(T bottom, T top)
+{
+ return bottom + wide_rand_excl(top - bottom);
+}
+
+template<typename T> inline T wide_rand_incl(T top)
+{
+ typedef internal::random_scaler<T, std::numeric_limits<T>::is_integer> scaler;
+ return scaler::rnd_incl(util::wide_rand<T>(), top);
+}
+
+template<typename T> inline T wide_rand_incl(T bottom, T top)
+{
+ return bottom + wide_rand_incl(top - bottom);
+}
+} // namespace util
+
+#endif
diff --git a/util/random_test.cc b/util/random_test.cc
new file mode 100644
index 000000000..6d8981de8
--- /dev/null
+++ b/util/random_test.cc
@@ -0,0 +1,191 @@
+#include <cstdlib>
+
+#include "util/random.hh"
+
+#define BOOST_TEST_MODULE RandomTest
+#include <boost/test/unit_test.hpp>
+
+namespace util
+{
+namespace
+{
+
+BOOST_AUTO_TEST_CASE(rand_int_returns_positive_no_greater_than_RAND_MAX)
+{
+ rand_init();
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand<int>();
+ BOOST_CHECK(random_number >= 0);
+ BOOST_CHECK(random_number <= RAND_MAX);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(rand_int_returns_different_consecutive_numbers)
+{
+ rand_init(99);
+ const int first = rand<int>(), second = rand<int>(), third = rand<int>();
+ // Sometimes you'll get the same number twice in a row, but generally the
+ // randomizer returns different numbers.
+ BOOST_CHECK(second != first || third != first);
+}
+
+BOOST_AUTO_TEST_CASE(rand_int_returns_different_numbers_for_different_seeds)
+{
+ rand_init(1);
+ const int one1 = rand<int>(), one2 = rand<int>();
+ rand_init(2);
+ const int two1 = rand<int>(), two2 = rand<int>();
+ BOOST_CHECK(two1 != one1 || two2 != one2);
+}
+
+BOOST_AUTO_TEST_CASE(rand_int_returns_same_sequence_for_same_seed)
+{
+ rand_init(1);
+ const int first = rand<int>();
+ rand_init(1);
+ const int second = rand<int>();
+ BOOST_CHECK_EQUAL(first, second);
+}
+
+BOOST_AUTO_TEST_CASE(rand_excl_int_returns_number_in_range)
+{
+ const int bottom = 10, top = 50;
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand_excl(bottom, top);
+ BOOST_CHECK(random_number >= bottom);
+ BOOST_CHECK(random_number < top);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(rand_excl_int_covers_full_range)
+{
+ // The spread of random numbers really goes all the way from "bottom"
+ // (inclusive) to "top" (exclusive). It's not some smaller subset.
+ // This test will randomly fail sometimes, though very very rarely, when the
+ // random numbers don't actually have enough different values.
+ const int bottom = 1, top = 4;
+ int lowest = 99, highest = -1;
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand_excl(bottom, top);
+ lowest = std::min(lowest, random_number);
+ highest = std::max(highest, random_number);
+ }
+
+ BOOST_CHECK_EQUAL(lowest, bottom);
+ BOOST_CHECK_EQUAL(highest, top - 1);
+}
+
+BOOST_AUTO_TEST_CASE(rand_incl_int_returns_number_in_range)
+{
+  const int bottom = 10, top = 50;
+  for (int i=0; i<100; i++)
+  {
+    const int random_number = rand_incl(bottom, top);
+    BOOST_CHECK(random_number >= bottom);  // lower bound is "bottom"; comparing against 0 could never fail
+    BOOST_CHECK(random_number <= top);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(rand_incl_int_covers_full_range)
+{
+ // The spread of random numbers really goes all the way from "bottom" to
+ // "top" inclusive. It's not some smaller subset.
+ // This test will randomly fail sometimes, though very very rarely, when the
+ // random numbers don't actually have enough different values.
+ const int bottom = 1, top = 4;
+ int lowest = 99, highest = -1;
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = rand_incl(bottom, top);
+ lowest = std::min(lowest, random_number);
+ highest = std::max(highest, random_number);
+ }
+
+ BOOST_CHECK_EQUAL(lowest, bottom);
+ BOOST_CHECK_EQUAL(highest, top);
+}
+
+BOOST_AUTO_TEST_CASE(rand_excl_float_returns_float_in_range)
+{
+ const float bottom = 5, top = 10;
+ for (int i=0; i<100; i++)
+ {
+ const float random_number = rand_excl(bottom, top);
+ BOOST_CHECK(random_number >= bottom);
+ BOOST_CHECK(random_number < top);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(rand_excl_float_returns_different_values)
+{
+ const float bottom = 5, top = 10;
+ float lowest = 99, highest = -1;
+ for (int i=0; i<10; i++)
+ {
+ const float random_number = rand_excl(bottom, top);
+ lowest = std::min(lowest, random_number);
+ highest = std::max(highest, random_number);
+ }
+ BOOST_CHECK(lowest < highest);
+}
+
+BOOST_AUTO_TEST_CASE(rand_float_incl_returns_float_in_range)
+{
+  const float bottom = 5, top = 10;
+  for (int i=0; i<1000; i++)
+  {
+    const float random_number = rand_incl(bottom, top);  // the inclusive variant, as the test name says (was rand_excl)
+    BOOST_CHECK(random_number >= bottom);
+    BOOST_CHECK(random_number <= top);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(rand_float_incl_returns_different_values)
+{
+  const float bottom = 0, top = 10;
+  float lowest = 99, highest = -1;
+  for (int i=0; i<10; i++)
+  {
+    const float random_number = rand_incl(bottom, top);  // the inclusive variant, as the test name says (was rand_excl)
+    lowest = std::min(lowest, random_number);
+    highest = std::max(highest, random_number);
+  }
+  BOOST_CHECK(lowest < highest);
+}
+
+BOOST_AUTO_TEST_CASE(wide_rand_int_returns_different_numbers_in_range)
+{
+ for (int i=0; i<100; i++)
+ {
+ const int random_number = wide_rand<int>();
+ BOOST_CHECK(random_number >= 0);
+ BOOST_CHECK(random_number <= RAND_MAX);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(wide_rand_long_long_returns_big_numbers)
+{
+ long long one = wide_rand<long long>(), two = wide_rand<long long>();
+ // This test will fail sometimes because of unlucky random numbers, but only
+ // very very rarely.
+ BOOST_CHECK(one > RAND_MAX || two > RAND_MAX);
+}
+
+BOOST_AUTO_TEST_CASE(wide_rand_excl_supports_larger_range)
+{
+ const long long top = 1000 * (long long)RAND_MAX;
+ long long
+ one = wide_rand_excl<long long>(top),
+ two = wide_rand_excl<long long>(top);
+ BOOST_CHECK(one < top);
+ BOOST_CHECK(two < top);
+ // This test will fail sometimes because of unlucky random numbers, but only
+ // very very rarely.
+ BOOST_CHECK(one > RAND_MAX || two > RAND_MAX);
+}
+
+} // namespace
+} // namespace util
diff --git a/util/tempfile.hh b/util/tempfile.hh
new file mode 100644
index 000000000..9b872a27e
--- /dev/null
+++ b/util/tempfile.hh
@@ -0,0 +1,151 @@
+#ifndef UTIL_TEMPFILE_H
+#define UTIL_TEMPFILE_H
+
+// Utilities for creating temporary files and directories.
+
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#endif
+
+#include <boost/filesystem.hpp>
+#include <boost/noncopyable.hpp>
+
+#include "util/exception.hh"
+#include "util/unistd.hh"
+
+namespace util
+{
+
+/// Obtain a directory for temporary files, e.g. /tmp.
+inline std::string temp_location()  // inline: defined in a header that several translation units may include
+{
+#if defined(_WIN32) || defined(_WIN64)
+  char dir_buffer[1000];
+  if (GetTempPath(1000, dir_buffer) == 0)
+    throw std::runtime_error("Could not read temporary directory.");
+  return std::string(dir_buffer);
+#else
+  // Candidate environment variables, tried in this order:
+  const char *const vars[] = {"TMPDIR", "TMP", "TEMPDIR", "TEMP", 0};
+  for (int i=0; vars[i]; ++i)
+  {
+    const char *val = getenv(vars[i]);
+    // Environment variable is set and nonempty. Use it.
+    if (val && *val) return val;
+  }
+  // No environment variables set. Default to /tmp.
+  return "/tmp";
+#endif
+}
+
+
+#if defined(_WIN32) || defined(_WIN64)
+/// Windows helper: create temporary filename.
+inline std::string windows_tmpnam()  // inline: header-defined, see temp_location()
+{
+  const std::string tmp = temp_location();
+  char output_buffer[MAX_PATH];
+  if (GetTempFileName(tmp.c_str(), "tmp", 0, output_buffer) == 0)
+    throw std::runtime_error("Could not create temporary file name.");
+  return output_buffer;
+}
+#else
+/** POSIX helper: create template for temporary filename.
+ *
+ * Writes the NUL-terminated template into buf, which must have room for at
+ * least PATH_MAX bytes. Throws std::runtime_error if the template is too long.
+ */
+inline void posix_tmp_template(char *buf)  // inline: header-defined, see temp_location()
+{
+  const std::string tmp = temp_location();
+  const std::string name_template = tmp + "/tmp.XXXXXX";
+  if (name_template.size() >= PATH_MAX-1)
+    throw std::runtime_error("Path for temp files is too long: " + tmp);
+  buf[name_template.copy(buf, name_template.size())] = '\0';  // copy() returns the count copied; replaces strcpy, which needed <cstring>
+}
+#endif
+
+
+/** Temporary directory.
+ *
+ * Automatically creates, and on destruction deletes, a temporary directory.
+ * The actual directory in the filesystem will only exist while the temp_dir
+ * object exists. If the directory no longer exists by the time the temp_dir
+ * is destroyed, cleanup is skipped. On POSIX, construction throws
+ * ErrnoException if mkdtemp() fails.
+ */
+class temp_dir : boost::noncopyable
+{
+public:
+  temp_dir()
+  {
+#if defined(_WIN32) || defined(_WIN64)
+    m_path = windows_tmpnam();
+    boost::filesystem::create_directory(m_path);
+#else
+    char buf[PATH_MAX];
+    posix_tmp_template(buf);
+    if (mkdtemp(buf) == NULL) throw ErrnoException();  // never build a std::string from NULL
+    m_path = buf;  // mkdtemp() replaced the XXXXXX suffix in place
+#endif
+  }
+
+  ~temp_dir()
+  {
+    boost::filesystem::remove_all(path());
+  }
+
+  /// Return the temporary directory's full path.
+  const std::string &path() const { return m_path; }
+
+private:
+  std::string m_path;
+};
+
+
+/** Temporary file.
+ *
+ * Automatically creates, and on destruction deletes, a temporary file.
+ *
+ * If the file no longer exists by the time the temp_file is destroyed,
+ * cleanup is skipped.
+ */
+class temp_file : boost::noncopyable
+{
+public:
+ temp_file()
+ {
+#if defined(_WIN32) || defined(_WIN64)
+ m_path = windows_tmpnam();
+ std::ofstream out(m_path.c_str());
+ out.flush();
+#else
+ char buf[PATH_MAX];
+ posix_tmp_template(buf);
+ const int fd = mkstemp(buf);
+ if (fd == -1) throw ErrnoException();
+ close(fd);
+ m_path = buf;
+#endif
+ }
+
+ ~temp_file()
+ {
+ boost::filesystem::remove(path());
+ }
+
+ /// Return the temporary file's full path.
+ const std::string &path() const { return m_path; }
+
+private:
+ std::string m_path;
+};
+
+} // namespace util
+
+#endif
diff --git a/util/tempfile_test.cc b/util/tempfile_test.cc
new file mode 100644
index 000000000..49736fe0c
--- /dev/null
+++ b/util/tempfile_test.cc
@@ -0,0 +1,119 @@
+#include "util/tempfile.hh"
+
+#include <fstream>
+
+#include <boost/filesystem.hpp>
+
+#define BOOST_TEST_MODULE TempFileTest
+#include <boost/test/unit_test.hpp>
+
+namespace util
+{
+namespace
+{
+
+BOOST_AUTO_TEST_CASE(temp_dir_has_path)
+{
+ BOOST_CHECK(temp_dir().path().size() > 0);
+}
+
+BOOST_AUTO_TEST_CASE(temp_dir_creates_temp_directory)
+{
+ const temp_dir t;
+ BOOST_CHECK(boost::filesystem::exists(t.path()));
+ BOOST_CHECK(boost::filesystem::is_directory(t.path()));
+}
+
+BOOST_AUTO_TEST_CASE(temp_dir_creates_unique_directory)
+{
+ BOOST_CHECK(temp_dir().path() != temp_dir().path());
+}
+
+BOOST_AUTO_TEST_CASE(temp_dir_cleans_up_directory)
+{
+ std::string path;
+ {
+ const temp_dir t;
+ path = t.path();
+ }
+ BOOST_CHECK(!boost::filesystem::exists(path));
+}
+
+BOOST_AUTO_TEST_CASE(temp_dir_cleanup_succeeds_if_directory_contains_file)
+{
+ std::string path;
+ {
+ const temp_dir t;
+ path = t.path();
+ boost::filesystem::create_directory(path + "/directory");
+ std::ofstream file((path + "/file").c_str());
+ file << "Text";
+ file.flush();
+ }
+ BOOST_CHECK(!boost::filesystem::exists(path));
+}
+
+BOOST_AUTO_TEST_CASE(temp_dir_cleanup_succeeds_if_directory_is_gone)
+{
+ std::string path;
+ {
+ const temp_dir t;
+ path = t.path();
+ boost::filesystem::remove_all(path);
+ }
+ BOOST_CHECK(!boost::filesystem::exists(path));
+}
+
+BOOST_AUTO_TEST_CASE(temp_file_has_path)
+{
+ BOOST_CHECK(temp_file().path().size() > 0);
+}
+
+BOOST_AUTO_TEST_CASE(temp_file_creates_temp_file)
+{
+ const temp_file f;
+ BOOST_CHECK(boost::filesystem::exists(f.path()));
+ BOOST_CHECK(boost::filesystem::is_regular_file(f.path()));
+}
+
+BOOST_AUTO_TEST_CASE(temp_file_creates_unique_file)
+{
+ BOOST_CHECK(temp_file().path() != temp_file().path());
+}
+
+BOOST_AUTO_TEST_CASE(temp_file_creates_writable_file)
+{
+ const std::string data = "Test-data-goes-here";
+ const temp_file f;
+ std::ofstream outfile(f.path().c_str());
+ outfile << data;
+ outfile.flush();
+ std::string read_data;
+ std::ifstream infile(f.path().c_str());
+ infile >> read_data;
+ BOOST_CHECK_EQUAL(data, read_data);
+}
+
+BOOST_AUTO_TEST_CASE(temp_file_cleans_up_file)
+{
+ std::string path;
+ {
+ const temp_file f;
+ path = f.path();
+ }
+ BOOST_CHECK(!boost::filesystem::exists(path));
+}
+
+BOOST_AUTO_TEST_CASE(temp_file_cleanup_succeeds_if_file_is_gone)
+{
+ std::string path;
+ {
+ const temp_file t;
+ path = t.path();
+ boost::filesystem::remove(path);
+ }
+ BOOST_CHECK(!boost::filesystem::exists(path));
+}
+
+} // namespace anonymous
+} // namespace util
diff --git a/util/tokenize.hh b/util/tokenize.hh
new file mode 100644
index 000000000..5d8430222
--- /dev/null
+++ b/util/tokenize.hh
@@ -0,0 +1,51 @@
+#ifndef TOKENIZE_H
+#define TOKENIZE_H
+
+#include <string>
+#include <vector>
+
+namespace util
+{
+
+/** Split input text into a series of tokens.
+ *
+ * Splits on spaces and tabs, no other whitespace characters, and is not
+ * locale-sensitive.
+ *
+ * The spaces themselves are not included. A sequence of consecutive space/tab
+ * characters counts as one.
+ *
+ * @param input NUL-terminated text to split. Must not be a null pointer.
+ * @return The tokens, in the order they occur in the input. Empty if the
+ *   input holds nothing but spaces and tabs.
+ */
+inline std::vector<std::string> tokenize(const char input[])
+{
+  std::vector<std::string> token;
+  // Walk the text with pointers rather than an int index, so that the scan
+  // cannot overflow on inputs longer than INT_MAX characters.
+  const char *cursor = input;
+  for (;;) {
+    // Skip the separators in front of the next token.
+    while (*cursor == ' ' || *cursor == '\t')
+      ++cursor;
+    if (*cursor == '\0')
+      break;
+
+    // The token runs up to the next separator or the end of the text.
+    const char *const first = cursor;
+    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t')
+      ++cursor;
+    token.push_back(std::string(first, cursor));
+  }
+  return token;
+}
+
+/** Split input string into a series of tokens.
+ *
+ * Behaves like tokenize(const char[]) applied to the string's c_str(), so
+ * only the text before any embedded NUL character is tokenized.
+ */
+inline std::vector<std::string> tokenize(const std::string &input)
+{
+  const char *const text = input.c_str();
+  return tokenize(text);
+}
+
+} // namespace util
+
+#endif
diff --git a/util/tokenize_test.cc b/util/tokenize_test.cc
new file mode 100644
index 000000000..d879fa97f
--- /dev/null
+++ b/util/tokenize_test.cc
@@ -0,0 +1,69 @@
+#include "util/tokenize.hh"
+
+#define BOOST_TEST_MODULE TokenizeTest
+#include <boost/test/unit_test.hpp>
+
+namespace util
+{
+namespace
+{
+
+BOOST_AUTO_TEST_CASE(empty_text_yields_empty_vector)
+{
+  // An empty string contains no tokens at all.
+  const std::vector<std::string> tokens(tokenize(""));
+  BOOST_CHECK_EQUAL(tokens.size(), 0);
+}
+
+BOOST_AUTO_TEST_CASE(whitespace_only_yields_empty_vector)
+{
+  // Text made up solely of separators contains no tokens either.
+  const std::vector<std::string> tokens(tokenize(" "));
+  BOOST_CHECK_EQUAL(tokens.size(), 0);
+}
+
+BOOST_AUTO_TEST_CASE(parses_single_token)
+{
+  // Text without any separator comes back as exactly one token.
+  const std::vector<std::string> tokens = util::tokenize("mytoken");
+  // Fatal check: if the count is wrong, stop here rather than index past the
+  // end of the vector below.
+  BOOST_REQUIRE_EQUAL(tokens.size(), 1);
+  BOOST_CHECK_EQUAL(tokens[0], "mytoken");
+}
+
+BOOST_AUTO_TEST_CASE(ignores_leading_whitespace)
+{
+  // Spaces and tabs in front of the first token must not produce a token.
+  const std::vector<std::string> tokens = util::tokenize(" \t mytoken");
+  // Fatal check: if the count is wrong, stop here rather than index past the
+  // end of the vector below.
+  BOOST_REQUIRE_EQUAL(tokens.size(), 1);
+  BOOST_CHECK_EQUAL(tokens[0], "mytoken");
+}
+
+BOOST_AUTO_TEST_CASE(ignores_trailing_whitespace)
+{
+  // Spaces and tabs after the last token must not produce a token.
+  const std::vector<std::string> tokens = util::tokenize("mytoken \t ");
+  // Fatal check: if the count is wrong, stop here rather than index past the
+  // end of the vector below.
+  BOOST_REQUIRE_EQUAL(tokens.size(), 1);
+  BOOST_CHECK_EQUAL(tokens[0], "mytoken");
+}
+
+BOOST_AUTO_TEST_CASE(splits_tokens_on_tabs)
+{
+  // A single tab separates two tokens.
+  const std::vector<std::string> tokens = util::tokenize("one\ttwo");
+  // Fatal check: if the count is wrong, stop here rather than index past the
+  // end of the vector below.
+  BOOST_REQUIRE_EQUAL(tokens.size(), 2);
+  BOOST_CHECK_EQUAL(tokens[0], "one");
+  BOOST_CHECK_EQUAL(tokens[1], "two");
+}
+
+BOOST_AUTO_TEST_CASE(splits_tokens_on_spaces)
+{
+  // A space separates two tokens.
+  const std::vector<std::string> tokens = util::tokenize("one two");
+  // Fatal check: if the count is wrong, stop here rather than index past the
+  // end of the vector below.
+  BOOST_REQUIRE_EQUAL(tokens.size(), 2);
+  BOOST_CHECK_EQUAL(tokens[0], "one");
+  BOOST_CHECK_EQUAL(tokens[1], "two");
+}
+
+BOOST_AUTO_TEST_CASE(treats_sequence_of_space_as_one_space)
+{
+  // A mixed run of tabs and spaces acts as one separator: no empty tokens.
+  const std::vector<std::string> tokens = util::tokenize("one\t \ttwo");
+  // Fatal check: if the count is wrong, stop here rather than index past the
+  // end of the vector below.
+  BOOST_REQUIRE_EQUAL(tokens.size(), 2);
+  BOOST_CHECK_EQUAL(tokens[0], "one");
+  BOOST_CHECK_EQUAL(tokens[1], "two");
+}
+
+} // namespace
+} // namespace util
diff --git a/util/unistd.hh b/util/unistd.hh
index 0379c4914..f99be592a 100644
--- a/util/unistd.hh
+++ b/util/unistd.hh
@@ -1,7 +1,7 @@
#ifndef UTIL_UNISTD_H
#define UTIL_UNISTD_H
-#if defined(_WIN32) || defined(_WIN64)
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
// Windows doesn't define <unistd.h>
//