github.com/moses-smt/mosesdecoder.git
author     Ulrich Germann <ugermann@inf.ed.ac.uk>  2014-11-20 20:55:51 +0300
committer  Ulrich Germann <ugermann@inf.ed.ac.uk>  2014-11-20 20:55:51 +0300
commit     7aa4d5d8d523e02fd387aed5fca34177672122ae (patch)
tree       a92cb9169a6142877c0f495ba4a57294b1977f4e
parent     07202c544c17d508de07415092ead7b664f41016 (diff)
parent     271141071265e6a535fdf26bc01e072d8d8d5e80 (diff)

Merge branch 'master' of https://github.com/moses-smt/mosesdecoder

Conflicts:
	moses-cmd/simulate-pe.cc
-rw-r--r--  .gitignore | 1
-rw-r--r--  BUILD-INSTRUCTIONS.txt | 1
-rw-r--r--  Jamroot | 33
-rw-r--r--  contrib/combine-ptables/README.md | 8
-rwxr-xr-x  contrib/combine-ptables/combine-ptables.pl | 30
-rw-r--r--  contrib/other-builds/CreateOnDiskPt/.cproject | 16
-rw-r--r--  contrib/other-builds/OnDiskPt/.cproject | 8
-rw-r--r--  contrib/other-builds/consolidate/.cproject | 29
-rw-r--r--  contrib/other-builds/consolidate/.project | 3
-rw-r--r--  contrib/other-builds/extract-ghkm/.cproject | 2
-rw-r--r--  contrib/other-builds/extract-mixed-syntax/.cproject | 11
-rw-r--r--  contrib/other-builds/extract-mixed-syntax/.project | 1
-rw-r--r--  contrib/other-builds/extract-rules/.cproject | 11
-rw-r--r--  contrib/other-builds/extract/.cproject | 2
-rw-r--r--  contrib/other-builds/extractor/.cproject | 5
-rw-r--r--  contrib/other-builds/manual-label/.cproject | 11
-rw-r--r--  contrib/other-builds/moses-chart-cmd.vcxproj | 115
-rw-r--r--  contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj | 486
-rw-r--r--  contrib/other-builds/moses-chart-cmd/.project | 125
-rw-r--r--  contrib/other-builds/moses-cmd/.cproject | 9
-rw-r--r--  contrib/other-builds/moses-cmd/.project | 20
-rw-r--r--  contrib/other-builds/moses/.cproject | 48
-rw-r--r--  contrib/other-builds/moses/.project | 395
-rw-r--r--  contrib/other-builds/score/.cproject | 7
-rw-r--r--  contrib/other-builds/server/.cproject (renamed from contrib/other-builds/moses-chart-cmd/.cproject) | 162
-rw-r--r--  contrib/other-builds/server/.project | 39
-rw-r--r--  contrib/rephraser/Jamfile | 1
-rw-r--r--  contrib/rephraser/paraphrase.cpp | 148
-rw-r--r--  contrib/server/Jamfile | 2
-rw-r--r--  contrib/server/mosesserver.cpp | 17
-rw-r--r--  contrib/sigtest-filter/Makefile | 2
-rw-r--r--  contrib/sigtest-filter/filter-pt.cpp | 429
-rw-r--r--  contrib/tmcombine/test/model5/model/lex.counts.e2f | 8
-rw-r--r--  contrib/tmcombine/test/model5/model/lex.counts.f2e | 8
-rw-r--r--  contrib/tmcombine/test/model5/model/lex.e2f | 8
-rw-r--r--  contrib/tmcombine/test/model5/model/lex.f2e | 8
-rw-r--r--  contrib/tmcombine/test/model5/model/phrase-table | 8
-rw-r--r--  contrib/tmcombine/test/model6/model/lex.counts.e2f | 8
-rw-r--r--  contrib/tmcombine/test/model6/model/lex.counts.f2e | 8
-rw-r--r--  contrib/tmcombine/test/model6/model/lex.e2f | 8
-rw-r--r--  contrib/tmcombine/test/model6/model/lex.f2e | 8
-rw-r--r--  contrib/tmcombine/test/model6/model/phrase-table | 5
-rw-r--r--  contrib/tmcombine/test/phrase-table_test11 | 9
-rwxr-xr-x  contrib/tmcombine/tmcombine.py | 10
-rw-r--r--  jam-files/sanity.jam | 6
-rw-r--r--  lm/Jamfile | 2
-rw-r--r--  lm/builder/adjust_counts.cc | 50
-rw-r--r--  lm/builder/adjust_counts.hh | 28
-rw-r--r--  lm/builder/adjust_counts_test.cc | 5
-rw-r--r--  lm/builder/initial_probabilities.cc | 7
-rw-r--r--  lm/builder/interpolate.cc | 94
-rw-r--r--  lm/builder/interpolate.hh | 3
-rw-r--r--  lm/builder/lmplz_main.cc | 36
-rw-r--r--  lm/builder/pipeline.cc | 4
-rw-r--r--  lm/builder/pipeline.hh | 9
-rw-r--r--  lm/builder/print.cc | 2
-rw-r--r--  mert/FeatureData.h | 5
-rw-r--r--  mert/MeteorScorer.cpp | 2
-rw-r--r--  mert/MiraWeightVector.cpp | 2
-rw-r--r--  mert/ScoreData.h | 3
-rw-r--r--  mert/kbmira.cpp | 7
-rw-r--r--  mira/Decoder.cpp | 4
-rw-r--r--  misc/prunePhraseTable.cpp | 100
-rw-r--r--  moses-chart-cmd/IOWrapper.h | 151
-rw-r--r--  moses-chart-cmd/Jamfile | 2
-rw-r--r--  moses-chart-cmd/Main.cpp | 385
-rw-r--r--  moses-chart-cmd/Main.h | 45
-rw-r--r--  moses-chart-cmd/mbr.cpp | 172
-rw-r--r--  moses-chart-cmd/mbr.h | 33
-rw-r--r--  moses-cmd/IOWrapper.cpp | 661
-rw-r--r--  moses-cmd/IOWrapper.h | 164
-rw-r--r--  moses-cmd/Jamfile | 2
-rw-r--r--  moses-cmd/LatticeMBRGrid.cpp | 16
-rw-r--r--  moses-cmd/Main.cpp | 524
-rw-r--r--  moses-cmd/Main.h | 5
-rw-r--r--  moses-cmd/simulate-pe.cc | 774
-rw-r--r--  moses/AlignmentInfo.cpp | 16
-rw-r--r--  moses/AlignmentInfo.h | 11
-rw-r--r--  moses/AlignmentInfoCollection.cpp | 12
-rw-r--r--  moses/AlignmentInfoCollection.h | 12
-rw-r--r--  moses/ChartManager.cpp | 5
-rw-r--r--  moses/ChartManager.h | 6
-rw-r--r--  moses/ChartParser.cpp | 3
-rw-r--r--  moses/FF/DecodeFeature.cpp | 2
-rw-r--r--  moses/FF/DecodeFeature.h | 3
-rw-r--r--  moses/FF/ExternalFeature.h | 1
-rw-r--r--  moses/FF/Factory.cpp | 21
-rw-r--r--  moses/FF/GlobalLexicalModel.cpp | 10
-rw-r--r--  moses/FF/GlobalLexicalModel.h | 24
-rw-r--r--  moses/FF/InternalTree.cpp | 4
-rw-r--r--  moses/FF/PhraseLengthFeature.h | 5
-rw-r--r--  moses/FF/PhrasePairFeature.cpp | 46
-rw-r--r--  moses/FF/PhrasePairFeature.h | 29
-rw-r--r--  moses/FF/PhrasePenalty.h | 3
-rw-r--r--  moses/FF/SourceWordDeletionFeature.cpp | 5
-rw-r--r--  moses/FF/StatefulFeatureFunction.h | 7
-rw-r--r--  moses/FF/StatelessFeatureFunction.h | 5
-rw-r--r--  moses/FF/SyntaxRHS.cpp | 8
-rw-r--r--  moses/FF/SyntaxRHS.h | 7
-rw-r--r--  moses/FF/TargetWordInsertionFeature.cpp | 6
-rw-r--r--  moses/FF/UnknownWordPenaltyProducer.h | 3
-rw-r--r--  moses/FF/WordPenaltyProducer.h | 3
-rw-r--r--  moses/FF/WordTranslationFeature.cpp | 46
-rw-r--r--  moses/FF/WordTranslationFeature.h | 17
-rw-r--r--  moses/HypergraphOutput.cpp | 4
-rw-r--r--  moses/IOWrapper.cpp (renamed from moses-chart-cmd/IOWrapper.cpp) | 1501
-rw-r--r--  moses/IOWrapper.h | 279
-rw-r--r--  moses/Jamfile | 16
-rw-r--r--  moses/LM/BilingualLM.cpp | 465
-rw-r--r--  moses/LM/BilingualLM.h | 142
-rw-r--r--  moses/LM/Implementation.cpp | 10
-rw-r--r--  moses/LM/Jamfile | 33
-rw-r--r--  moses/LM/Ken.cpp | 54
-rw-r--r--  moses/LM/Ken.h | 2
-rw-r--r--  moses/LM/NeuralLMWrapper.cpp | 10
-rw-r--r--  moses/LM/NeuralLMWrapper.h | 1
-rw-r--r--  moses/LM/bilingual-lm/BiLM_NPLM.cpp | 138
-rw-r--r--  moses/LM/bilingual-lm/BiLM_NPLM.h | 49
-rw-r--r--  moses/LM/oxlm/Mapper.cpp | 67
-rw-r--r--  moses/LM/oxlm/Mapper.h | 46
-rw-r--r--  moses/LM/oxlm/OxLM.cpp (renamed from moses/LM/oxlm/LBLLM.cpp) | 65
-rw-r--r--  moses/LM/oxlm/OxLM.h (renamed from moses/LM/oxlm/LBLLM.h) | 31
-rw-r--r--  moses/LM/oxlm/OxLMMapper.cpp | 47
-rw-r--r--  moses/LM/oxlm/OxLMMapper.h | 35
-rw-r--r--  moses/LM/oxlm/OxLMParallelMapper.cpp | 40
-rw-r--r--  moses/LM/oxlm/OxLMParallelMapper.h | 21
-rw-r--r--  moses/LM/oxlm/SourceOxLM.cpp | 137
-rw-r--r--  moses/LM/oxlm/SourceOxLM.h | 49
-rw-r--r--  moses/LatticeMBR.cpp | 3
-rw-r--r--  moses/LatticeMBR.h | 2
-rw-r--r--  moses/Manager.cpp | 21
-rw-r--r--  moses/Manager.h | 5
-rw-r--r--  moses/MockHypothesis.cpp | 2
-rw-r--r--  moses/PDTAimp.cpp | 1
-rw-r--r--  moses/Parameter.cpp | 10
-rw-r--r--  moses/Parameter.h | 7
-rw-r--r--  moses/StaticData.cpp | 14
-rw-r--r--  moses/StaticData.h | 13
-rw-r--r--  moses/Syntax/BoundedPriorityContainer.h | 164
-rw-r--r--  moses/Syntax/Cube.cpp | 138
-rw-r--r--  moses/Syntax/Cube.h | 58
-rw-r--r--  moses/Syntax/CubeQueue.cpp | 37
-rw-r--r--  moses/Syntax/CubeQueue.h | 50
-rw-r--r--  moses/Syntax/KBestExtractor.cpp | 317
-rw-r--r--  moses/Syntax/KBestExtractor.h | 118
-rw-r--r--  moses/Syntax/NonTerminalMap.h | 71
-rw-r--r--  moses/Syntax/PHyperedge.h | 22
-rw-r--r--  moses/Syntax/PVertex.h | 21
-rw-r--r--  moses/Syntax/RuleTable.h | 24
-rw-r--r--  moses/Syntax/RuleTableFF.cpp | 51
-rw-r--r--  moses/Syntax/RuleTableFF.h | 50
-rw-r--r--  moses/Syntax/S2T/DerivationWriter.cpp | 100
-rw-r--r--  moses/Syntax/S2T/DerivationWriter.h | 38
-rw-r--r--  moses/Syntax/S2T/Manager-inl.h | 387
-rw-r--r--  moses/Syntax/S2T/Manager.h | 69
-rw-r--r--  moses/Syntax/S2T/OovHandler-inl.h | 107
-rw-r--r--  moses/Syntax/S2T/OovHandler.h | 49
-rw-r--r--  moses/Syntax/S2T/PChart.cpp | 34
-rw-r--r--  moses/Syntax/S2T/PChart.h | 89
-rw-r--r--  moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h | 44
-rw-r--r--  moses/Syntax/S2T/ParserCallback.h | 83
-rw-r--r--  moses/Syntax/S2T/Parsers/Parser.h | 30
-rw-r--r--  moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h | 164
-rw-r--r--  moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h | 61
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h | 185
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h | 71
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp | 190
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h | 65
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h | 27
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h | 21
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp | 160
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h | 41
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h | 32
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp | 131
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h | 41
-rw-r--r--  moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h | 95
-rw-r--r--  moses/Syntax/S2T/RuleTrie.h | 40
-rw-r--r--  moses/Syntax/S2T/RuleTrieCYKPlus.cpp | 151
-rw-r--r--  moses/Syntax/S2T/RuleTrieCYKPlus.h | 89
-rw-r--r--  moses/Syntax/S2T/RuleTrieCreator.h | 33
-rw-r--r--  moses/Syntax/S2T/RuleTrieLoader.cpp | 156
-rw-r--r--  moses/Syntax/S2T/RuleTrieLoader.h | 31
-rw-r--r--  moses/Syntax/S2T/RuleTrieScope3.cpp | 153
-rw-r--r--  moses/Syntax/S2T/RuleTrieScope3.h | 106
-rw-r--r--  moses/Syntax/S2T/SChart.cpp | 20
-rw-r--r--  moses/Syntax/S2T/SChart.h | 50
-rw-r--r--  moses/Syntax/SHyperedge.cpp | 59
-rw-r--r--  moses/Syntax/SHyperedge.h | 28
-rw-r--r--  moses/Syntax/SHyperedgeBundle.h | 30
-rw-r--r--  moses/Syntax/SHyperedgeBundleScorer.h | 28
-rw-r--r--  moses/Syntax/SVertex.cpp | 28
-rw-r--r--  moses/Syntax/SVertex.h | 31
-rw-r--r--  moses/Syntax/SVertexRecombinationOrderer.h | 38
-rw-r--r--  moses/Syntax/SVertexStack.h | 28
-rw-r--r--  moses/Syntax/SymbolEqualityPred.h | 24
-rw-r--r--  moses/Syntax/SymbolHasher.h | 25
-rw-r--r--  moses/TargetPhrase.cpp | 24
-rw-r--r--  moses/TargetPhrase.h | 21
-rw-r--r--  moses/ThreadPool.cpp | 5
-rw-r--r--  moses/TranslationModel/BilingualDynSuffixArray.cpp | 6
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp | 1
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp | 1
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp | 7
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h | 4
-rw-r--r--  moses/TranslationModel/CompactPT/MurmurHash3.cpp | 2
-rw-r--r--  moses/TranslationModel/CompactPT/PhraseTableCreator.cpp | 4
-rw-r--r--  moses/TranslationModel/DynSAInclude/params.cpp | 3
-rw-r--r--  moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp | 2
-rw-r--r--  moses/TranslationModel/PhraseDictionaryMultiModelCounts.h | 2
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderStandard.cpp | 3
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp | 2
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp | 3
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp | 2
-rw-r--r--  moses/TranslationModel/UG/mm/ug_mm_2d_table.h | 23
-rw-r--r--  moses/TranslationModel/UG/mmsapt.cpp | 1
-rw-r--r--  moses/TranslationModel/UG/sim-pe.cc | 3
-rw-r--r--  moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp | 11
-rw-r--r--  moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h | 4
-rw-r--r--  moses/TranslationTask.cpp | 434
-rw-r--r--  moses/TranslationTask.h | 85
-rw-r--r--  moses/TypeDef.h | 8
-rw-r--r--  moses/Util.cpp | 41
-rw-r--r--  moses/Util.h | 14
-rw-r--r--  moses/XmlOption.cpp | 48
-rw-r--r--  moses/mbr.cpp (renamed from moses-cmd/mbr.cpp) | 0
-rw-r--r--  moses/mbr.h (renamed from moses-cmd/mbr.h) | 0
-rw-r--r--  phrase-extract/ScoreFeatureTest.cpp | 20
-rw-r--r--  phrase-extract/consolidate-main.cpp | 2
-rw-r--r--  phrase-extract/extract-ghkm/Alignment.cpp | 8
-rw-r--r--  phrase-extract/extract-ghkm/Alignment.h | 2
-rw-r--r--  phrase-extract/extract-ghkm/ExtractGHKM.cpp | 22
-rw-r--r--  phrase-extract/extract-ghkm/Options.h | 20
-rw-r--r--  phrase-extract/extract-ghkm/PhraseOrientation.cpp | 1
-rw-r--r--  phrase-extract/extract-ghkm/PhraseOrientation.h | 1
-rw-r--r--  phrase-extract/extract-ghkm/ScfgRule.h | 2
-rw-r--r--  phrase-extract/extract-ghkm/ScfgRuleWriter.cpp | 22
-rw-r--r--  phrase-extract/extract-ghkm/ScfgRuleWriter.h | 2
-rw-r--r--  phrase-extract/extract-rules-main.cpp | 10
-rw-r--r--  phrase-extract/score-main.cpp | 2
-rw-r--r--  regression-testing/Jamfile | 2
-rw-r--r--  scripts/ems/experiment.meta | 45
-rwxr-xr-x  scripts/ems/experiment.perl | 7
-rwxr-xr-x  scripts/ems/support/analysis.perl | 118
-rwxr-xr-x  scripts/ems/support/build-sparse-features.perl | 8
-rwxr-xr-x  scripts/ems/support/substitute-filtered-tables.perl | 4
-rwxr-xr-x  scripts/ems/support/thot-lm-wrapper.perl | 20
-rwxr-xr-x  scripts/generic/moses_sim_pe.py | 29
-rw-r--r--  scripts/nplm-training/README | 9
-rwxr-xr-x  scripts/nplm-training/averageNullEmbedding_baseline.py | 44
-rwxr-xr-x  scripts/nplm-training/extract.py | 121
-rwxr-xr-x  scripts/nplm-training/extract_test.py | 73
-rwxr-xr-x  scripts/nplm-training/extract_training.py | 167
-rwxr-xr-x  scripts/nplm-training/reduce_ngrams.py | 28
-rwxr-xr-x  scripts/nplm-training/tag.sh | 17
-rwxr-xr-x  scripts/nplm-training/test_nplm.py | 57
-rwxr-xr-x  scripts/nplm-training/train_nplm.py | 121
-rwxr-xr-x  scripts/other/blame-stat.sh | 4
-rwxr-xr-x  scripts/recaser/train-recaser.perl | 2
-rwxr-xr-x  scripts/tokenizer/deescape-special-chars-PTB.perl | 19
-rwxr-xr-x  scripts/tokenizer/tokenizer_PTB.perl | 399
-rwxr-xr-x  scripts/training/filter-model-given-input.pl | 3
-rwxr-xr-x  scripts/training/mert-moses.pl | 12
-rwxr-xr-x  scripts/training/train-model.perl | 13
-rwxr-xr-x  scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl | 54
-rwxr-xr-x  scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl | 3
-rw-r--r--  search/edge_generator.hh | 2
-rw-r--r--  search/types.hh | 2
-rw-r--r--  search/vertex_generator.hh | 2
-rw-r--r--  util/file.cc | 6
-rw-r--r--  util/file.hh | 26
-rw-r--r--  util/scoped.cc | 4
-rw-r--r--  util/scoped.hh | 123

272 files changed, 11247 insertions(+), 4974 deletions(-)
diff --git a/.gitignore b/.gitignore
index e7c37d86c..9c82eb9f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+tools
*.d
*.pyc
*.lo
diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt
index 692df8616..d3983fd18 100644
--- a/BUILD-INSTRUCTIONS.txt
+++ b/BUILD-INSTRUCTIONS.txt
@@ -1,3 +1,4 @@
Please see the Moses website on how to compile and run Moses
http://www.statmt.org/moses/?n=Development.GetStarted
+
diff --git a/Jamroot b/Jamroot
index 04e49a5ce..dc0dd345f 100644
--- a/Jamroot
+++ b/Jamroot
@@ -70,6 +70,7 @@
#-a to build from scratch
#-j$NCPUS to compile in parallel
#--clean to clean
+#--debug-build to build with -Og. Only available with gcc 4.8+
import option ;
import modules ;
@@ -97,6 +98,11 @@ if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_min
echo "Tip: install tcmalloc for faster threading. See BUILD-INSTRUCTIONS.txt for more information." ;
}
+if [ option.get "debug-build" : : "yes" ] {
+ requirements += <cxxflags>-Og ;
+ echo "Building with -Og to enable easier profiling and debugging. Only available on gcc 4.8+." ;
+}
+
if [ option.get "enable-mpi" : : "yes" ] {
import mpi ;
using mpi ;
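
A minimal invocation sketch for the new switch (the -j value is illustrative,
not mandated by this commit):

    ./bjam --debug-build -j4

This compiles everything with -Og, which keeps the binaries debuggable while
retaining cheap optimizations; as the echo above notes, it needs gcc 4.8+.
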
@@ -114,7 +120,7 @@ requirements += [ option.get "with-mm" : : <define>PT_UG ] ;
requirements += [ option.get "with-mm" : : <define>MAX_NUM_FACTORS=4 ] ;
requirements += [ option.get "unlabelled-source" : : <define>UNLABELLED_SOURCE ] ;
-if [ option.get "with-lbllm" ] {
+if [ option.get "with-oxlm" ] {
external-lib boost_serialization ;
external-lib gomp ;
requirements += <library>boost_serialization ;
@@ -167,7 +173,7 @@ project : requirements
;
#Add directories here if you want their incidental targets too (i.e. tests).
-build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses-chart-cmd mira scripts regression-testing ;
+build-projects lm util phrase-extract search moses moses/LM mert moses-cmd mira scripts regression-testing ;
if [ option.get "with-mm" : : "yes" ]
{
@@ -175,7 +181,6 @@ if [ option.get "with-mm" : : "yes" ]
moses/TranslationModel/UG//ptable-describe-features
moses/TranslationModel/UG//count-ptable-features
moses/TranslationModel/UG//ptable-lookup
- moses/TranslationModel/UG//sim-pe
moses/TranslationModel/UG//spe-check-coverage
moses/TranslationModel/UG/mm//mtt-demo1
moses/TranslationModel/UG/mm//mtt-build
@@ -195,9 +200,19 @@ else
alias mm ;
}
+if [ option.get "with-rephraser" : : "yes" ]
+{
+ alias rephraser :
+ contrib/rephraser//paraphrase
+ ;
+}
+else
+{
+ alias rephraser ;
+}
+
alias programs :
lm//programs
-moses-chart-cmd//moses_chart
moses-cmd//programs
OnDiskPt//CreateOnDiskPt
OnDiskPt//queryOnDiskPt
@@ -214,11 +229,12 @@ biconcor
mira//mira
contrib/server//mosesserver
mm
+rephraser
;
install-bin-libs programs ;
-install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
+install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
install-headers headers-moses : moses//headers-to-install : moses ;
alias install : prefix-bin prefix-lib headers-base headers-moses ;
@@ -232,3 +248,10 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
echo "To disable this message, delete $(TOP)/dist ." ;
echo ;
}
+
+local temp = [ _shell "mkdir bin" ] ;
+local temp = [ _shell "rm bin/moses_chart" ] ;
+local temp = [ _shell "cd bin && ln -s moses moses_chart" ] ;
+
+
+
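
The _shell lines above are a stop-gap: with moses-chart-cmd dropped from
build-projects and from the programs alias, the chart decoder now ships inside
the moses binary itself, and bin/moses_chart is kept as a compatibility
symlink. The new rephraser alias is switched on like the other optional
targets; a hypothetical invocation:

    ./bjam --with-rephraser -j4
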
diff --git a/contrib/combine-ptables/README.md b/contrib/combine-ptables/README.md
index b180f9202..14d13857d 100644
--- a/contrib/combine-ptables/README.md
+++ b/contrib/combine-ptables/README.md
@@ -59,6 +59,14 @@ This corresponds to the option:
`--newSourceMaxLength=4`
+BACKOFF
+-------
+
+This combination technique is a simplified version of the fill-up technique.
+Unlike fill-up, the backoff technique does not add the additional binary
+feature denoting the provenance of an entry.
+
+
LINEAR INTERPOLATION
--------------------
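
Following the usage string added to combine-ptables.pl below, a backoff
combination is invoked like the other modes; a sketch with illustrative
table paths:

    combine-ptables.pl --mode=backoff ptable-indomain ptable-background > combined-ptable

Tables are consulted in the order given: the first table that contains an
entry wins, and, unlike fillup, no provenance feature is appended.
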
diff --git a/contrib/combine-ptables/combine-ptables.pl b/contrib/combine-ptables/combine-ptables.pl
index de9df7ec2..776565640 100755
--- a/contrib/combine-ptables/combine-ptables.pl
+++ b/contrib/combine-ptables/combine-ptables.pl
@@ -17,7 +17,7 @@ sub main {
my $usage = "
USAGE
-----
-combine-ptables.pl --mode=(interp|union|fillup|intersect1|stats) ptable1 ptable2 ... ptableN > combined-ptable
+combine-ptables.pl --mode=(interp|union|fillup|backoff|intersect1|stats) ptable1 ptable2 ... ptableN > combined-ptable
combine-ptables.pl --mode=intersect1 reotable-unpruned ptable-pruned > reotable-pruned
-----
#
@@ -32,6 +32,8 @@ combine-ptables.pl --mode=intersect1 reotable-unpruned ptable-pruned > reotable-
# Required:
# --mode fillup: Each entry is taken only from the first table that contains it.
# A binary feature is added from each table except the first.
+# backoff: Each entry is taken only from the first table that contains it.
+# NO binary feature is added.
# interp: Linear interpolation.
# union: Union of entries, feature vectors are concatenated.
# intersect1: Intersection of entries, feature vectors taken from the first table.
@@ -51,6 +53,9 @@ combine-ptables.pl --mode=intersect1 reotable-unpruned ptable-pruned > reotable-
# Options for 'fillup':
# --newSourceMaxLength=INT Don't include \"new\" source phrases if longer than INT words.
#
+# Options for 'backoff':
+# --newSourceMaxLength=INT Don't include \"new\" source phrases if longer than INT words.
+#
# Options for 'interp':
# --weights=W1,W2,...WN Weights for interpolation. By default, uniform weights are applied.
# --epsilon=X Score to assume when a phrase pair is not contained in a table (in 'interp' and 'union' modes).
@@ -85,7 +90,7 @@ GetOptions ('debug' => \$debug,
if($help) { die "$usage\n\n"; }
-if($combination_mode!~/(interp|union|fillup|intersect1|stats)/) {die "$usage\nUnknown combination mode!\n"};
+if($combination_mode!~/(interp|union|fillup|backoff|intersect1|stats)/) {die "$usage\nUnknown combination mode!\n"};
if(@ARGV < 2) {die "$usage\n\n Please provide at least 2 tables to combine \n\n";}
@@ -103,7 +108,7 @@ my $nbtables = scalar(@tables);
# The newSourceMaxLength option requires reading all the first PT before starting the combination
my %sourcePhrasesPT1;
-if($combination_mode eq "fillup" && $newSourceMaxLength>-1) {
+if((($combination_mode eq "fillup") || ($combination_mode eq "backoff")) && $newSourceMaxLength>-1) {
my $table1=$tables[0];
$table1 =~ s/(.*\.gz)\s*$/gzip -dc < $1|/;
open(TABLE1, "$table1") or die "Cannot open $table1: ($!)\n";
@@ -281,6 +286,25 @@ sub combine_ppair(PPAIRS_REFARRAY, TABLE_INDICES_REFARRAY) {
}
push(@scores, @bin_feats);
}
+ ### Backoff
+ elsif($combination_mode eq "backoff") {
+ #my @bin_feats=(($exp_zero) x ($nbtables-1));
+ for(my $i=0; $i<$nbtables; $i++) {
+ if($ra_toRead->[$i]) {
+ $ppair= shift(@{$ra_ppairs->[$i]});
+ # pruning criteria are applied here:
+ if($i>0 && $newSourceMaxLength>-1) {
+ $ppair=~m/^(.*?)$delim_RE/;
+ if(scalar(split(/ +/, $1)) > $newSourceMaxLength &&
+ !defined($sourcePhrasesPT1{$1}))
+ { $to_print=0; }
+ }
+ @scores = @{shift(@{$ra_ppairs->[$i]})};
+ $additional_info=shift(@{$ra_ppairs->[$i]});
+ last;
+ }
+ }
+ }
### Linear interpolation
elsif($combination_mode eq "interp") {
my $firstPpair=-1;
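
The backoff branch added above scans the tables in priority order and keeps
the first phrase pair it finds. A stripped-down, self-contained Perl sketch of
that rule (hypothetical in-memory tables, not the script's actual stream-based
merge):

    use strict;
    use warnings;

    # Tables in priority order; keys are phrase pairs, values are score strings.
    my %ptable1 = ( 'der Hund ||| the dog' => '0.7 0.6' );
    my %ptable2 = ( 'der Hund ||| the dog'  => '0.2 0.1',
                    'die Katze ||| the cat' => '0.5 0.4' );

    my %combined;
    for my $table ( \%ptable1, \%ptable2 ) {
        for my $pair ( sort keys %$table ) {
            # Backoff: keep only the first occurrence, append no extra feature.
            $combined{$pair} = $table->{$pair} unless exists $combined{$pair};
        }
    }
    print "$_ ||| $combined{$_}\n" for sort keys %combined;

Unlike the fillup branch earlier in the sub, nothing is pushed onto @scores to
mark which table an entry came from.
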
diff --git a/contrib/other-builds/CreateOnDiskPt/.cproject b/contrib/other-builds/CreateOnDiskPt/.cproject
index 84c847336..e5082178a 100644
--- a/contrib/other-builds/CreateOnDiskPt/.cproject
+++ b/contrib/other-builds/CreateOnDiskPt/.cproject
@@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.602770742" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -18,7 +18,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.602770742." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1436139469" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.622899770" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/CreateOnDiskPt}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1448999623" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/CreateOnDiskPt}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1448999623" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.2139008298" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.2008193341" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.627728792" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -54,6 +54,7 @@
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="dl"/>
+ <listOptionValue builtIn="false" value="rt"/>
</option>
<option id="gnu.cpp.link.option.paths.815001500" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
@@ -81,12 +82,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.168814843" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -142,5 +143,12 @@
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/CreateOnDiskPt"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/CreateOnDiskPt"/>
+ </configuration>
+ </storageModule>
</cproject>
diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject
index e32a5baea..f551380fd 100644
--- a/contrib/other-builds/OnDiskPt/.cproject
+++ b/contrib/other-builds/OnDiskPt/.cproject
@@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -72,13 +72,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
diff --git a/contrib/other-builds/consolidate/.cproject b/contrib/other-builds/consolidate/.cproject
index 9caa531d6..4593957dc 100644
--- a/contrib/other-builds/consolidate/.cproject
+++ b/contrib/other-builds/consolidate/.cproject
@@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.2091728208" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -18,7 +18,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.2091728208." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.69362991" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.641760346" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/consolidate}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1286696537" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/consolidate}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1286696537" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1571215005" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1186248186" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -37,14 +37,27 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.110628197" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.libs.1393924562" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="moses"/>
+ <listOptionValue builtIn="false" value="search"/>
+ <listOptionValue builtIn="false" value="OnDiskPt"/>
+ <listOptionValue builtIn="false" value="lm"/>
<listOptionValue builtIn="false" value="util"/>
<listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="boost_serialization"/>
+ <listOptionValue builtIn="false" value="boost_system"/>
+ <listOptionValue builtIn="false" value="boost_thread"/>
+ <listOptionValue builtIn="false" value="boost_filesystem"/>
+ <listOptionValue builtIn="false" value="pthread"/>
<listOptionValue builtIn="false" value="z"/>
+ <listOptionValue builtIn="false" value="dl"/>
+ <listOptionValue builtIn="false" value="rt"/>
</option>
<option id="gnu.cpp.link.option.paths.1967422094" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1093223502" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
@@ -73,12 +86,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.185559773" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -134,5 +147,13 @@
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/consolidate"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/consolidate"/>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>
diff --git a/contrib/other-builds/consolidate/.project b/contrib/other-builds/consolidate/.project
index 7e5995030..db9a1fa8f 100644
--- a/contrib/other-builds/consolidate/.project
+++ b/contrib/other-builds/consolidate/.project
@@ -3,7 +3,10 @@
<name>consolidate</name>
<comment></comment>
<projects>
+ <project>lm</project>
<project>moses</project>
+ <project>OnDiskPt</project>
+ <project>search</project>
<project>util</project>
</projects>
<buildSpec>
diff --git a/contrib/other-builds/extract-ghkm/.cproject b/contrib/other-builds/extract-ghkm/.cproject
index 6d63edd2e..a567905ee 100644
--- a/contrib/other-builds/extract-ghkm/.cproject
+++ b/contrib/other-builds/extract-ghkm/.cproject
@@ -18,7 +18,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1975272196." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1513645956" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.621141597" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/extract-ghkm}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1641243676" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extract-ghkm}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1641243676" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.150240237" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.494510261" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.520735766" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
diff --git a/contrib/other-builds/extract-mixed-syntax/.cproject b/contrib/other-builds/extract-mixed-syntax/.cproject
index 736e79926..f246b0c32 100644
--- a/contrib/other-builds/extract-mixed-syntax/.cproject
+++ b/contrib/other-builds/extract-mixed-syntax/.cproject
@@ -18,7 +18,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1409305044." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1388217813" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.933039924" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/extract-mixed-syntax}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.48110463" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extract-mixed-syntax}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.48110463" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.98916974" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1188224255" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.391351501" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -138,5 +138,12 @@
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
+ </configuration>
+ </storageModule>
</cproject>
diff --git a/contrib/other-builds/extract-mixed-syntax/.project b/contrib/other-builds/extract-mixed-syntax/.project
index 0f05a7b41..7fdbadabb 100644
--- a/contrib/other-builds/extract-mixed-syntax/.project
+++ b/contrib/other-builds/extract-mixed-syntax/.project
@@ -3,6 +3,7 @@
<name>extract-mixed-syntax</name>
<comment></comment>
<projects>
+ <project>util</project>
</projects>
<buildSpec>
<buildCommand>
diff --git a/contrib/other-builds/extract-rules/.cproject b/contrib/other-builds/extract-rules/.cproject
index 487d8f89f..e79f0f526 100644
--- a/contrib/other-builds/extract-rules/.cproject
+++ b/contrib/other-builds/extract-rules/.cproject
@@ -18,7 +18,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1909818145." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.702289239" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.769221744" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/extract-rules}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1538811811" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extract-rules}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1538811811" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.417385938" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.274036343" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1227466042" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -121,5 +121,12 @@
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/extract-rules"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/extract-rules"/>
+ </configuration>
+ </storageModule>
</cproject>
diff --git a/contrib/other-builds/extract/.cproject b/contrib/other-builds/extract/.cproject
index cd8103241..10701cb6e 100644
--- a/contrib/other-builds/extract/.cproject
+++ b/contrib/other-builds/extract/.cproject
@@ -18,7 +18,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.2119725657." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1708444053" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.645190133" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/extract}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1816006533" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extract}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1816006533" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.876593881" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1859867372" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1585316374" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject
index 8e423b3b0..613c41d5c 100644
--- a/contrib/other-builds/extractor/.cproject
+++ b/contrib/other-builds/extractor/.cproject
@@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1133345948" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -48,6 +48,7 @@
<listOptionValue builtIn="false" value="boost_thread"/>
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="pthread"/>
+ <listOptionValue builtIn="false" value="rt"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.656319745" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
@@ -67,12 +68,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1385955159" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
diff --git a/contrib/other-builds/manual-label/.cproject b/contrib/other-builds/manual-label/.cproject
index 3a6b129d4..d9297a9fc 100644
--- a/contrib/other-builds/manual-label/.cproject
+++ b/contrib/other-builds/manual-label/.cproject
@@ -18,7 +18,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.2107801703." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.502948364" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.1431969079" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/manual-label}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.2101075234" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/manual-label}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.2101075234" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1118840081" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.2037265673" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.400985496" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -121,5 +121,12 @@
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/manual-label"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/manual-label"/>
+ </configuration>
+ </storageModule>
</cproject>
diff --git a/contrib/other-builds/moses-chart-cmd.vcxproj b/contrib/other-builds/moses-chart-cmd.vcxproj
deleted file mode 100644
index 25fe74588..000000000
--- a/contrib/other-builds/moses-chart-cmd.vcxproj
+++ /dev/null
@@ -1,115 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
- <ItemGroup Label="ProjectConfigurations">
- <ProjectConfiguration Include="Debug|Win32">
- <Configuration>Debug</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Release|Win32">
- <Configuration>Release</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- </ItemGroup>
- <PropertyGroup Label="Globals">
- <ProjectGuid>{C3AF5C05-D4EC-41D2-8319-D1E69B9B5820}</ProjectGuid>
- <RootNamespace>moseschartcmd</RootNamespace>
- <Keyword>Win32Proj</Keyword>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
- <ConfigurationType>Application</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
- <ConfigurationType>Application</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
- <ImportGroup Label="ExtensionSettings">
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <PropertyGroup Label="UserMacros" />
- <PropertyGroup>
- <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
- </PropertyGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- <ClCompile>
- <Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>C:\Program Files\boost\boost_1_47;$(SolutionDir)/moses/src;$(SolutionDir)/kenlm;$(SolutionDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;LM_INTERNAL;TRACE_ENABLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <MinimalRebuild>true</MinimalRebuild>
- <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
- <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
- </ClCompile>
- <Link>
- <AdditionalDependencies>zdll.lib;$(SolutionDir)$(Configuration)\moses.lib;$(SolutionDir)$(Configuration)\kenlm.lib;%(AdditionalDependencies)</AdditionalDependencies>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <SubSystem>Console</SubSystem>
- <RandomizedBaseAddress>false</RandomizedBaseAddress>
- <DataExecutionPrevention>
- </DataExecutionPrevention>
- <TargetMachine>MachineX86</TargetMachine>
- </Link>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- <ClCompile>
- <AdditionalIncludeDirectories>C:\Program Files\boost\boost_1_47;$(SolutionDir)/moses/src;$(SolutionDir)/kenlm;$(SolutionDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;LM_INTERNAL;TRACE_ENABLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
- </ClCompile>
- <Link>
- <AdditionalDependencies>zdll.lib;$(SolutionDir)$(Configuration)\moses.lib;$(SolutionDir)$(Configuration)\kenlm.lib;%(AdditionalDependencies)</AdditionalDependencies>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <SubSystem>Console</SubSystem>
- <OptimizeReferences>true</OptimizeReferences>
- <EnableCOMDATFolding>true</EnableCOMDATFolding>
- <RandomizedBaseAddress>false</RandomizedBaseAddress>
- <DataExecutionPrevention>
- </DataExecutionPrevention>
- <TargetMachine>MachineX86</TargetMachine>
- </Link>
- </ItemDefinitionGroup>
- <ItemGroup>
- <ClCompile Include="src\IOWrapper.cpp" />
- <ClCompile Include="src\Main.cpp" />
- <ClCompile Include="src\mbr.cpp" />
- </ItemGroup>
- <ItemGroup>
- <ClInclude Include="src\IOWrapper.h" />
- <ClInclude Include="src\Main.h" />
- <ClInclude Include="src\mbr.h" />
- </ItemGroup>
- <ItemGroup>
- <ProjectReference Include="..\moses\moses.vcxproj">
- <Project>{8122157a-0de5-44ff-8e5b-024ed6ace7af}</Project>
- <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
- </ProjectReference>
- <ProjectReference Include="..\OnDiskPt\OnDiskPt.vcxproj">
- <Project>{8b07671b-cbaf-4514-affd-ce238cd427e9}</Project>
- <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
- </ProjectReference>
- </ItemGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
- <ImportGroup Label="ExtensionTargets">
- </ImportGroup>
-</Project> \ No newline at end of file
diff --git a/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj b/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj
deleted file mode 100644
index cc0f3caf7..000000000
--- a/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj
+++ /dev/null
@@ -1,486 +0,0 @@
-// !$*UTF8*$!
-{
- archiveVersion = 1;
- classes = {
- };
- objectVersion = 45;
- objects = {
-
-/* Begin PBXBuildFile section */
- 1EAF9DC614B9F8CD005E8EBD /* liblm.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EAF9DC314B9F8BA005E8EBD /* liblm.a */; };
- 1EAF9DC714B9F8CD005E8EBD /* libmoses.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EAF9DAD14B9F8AD005E8EBD /* libmoses.a */; };
- 1EAF9DC814B9F8CD005E8EBD /* libOnDiskPt.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EAF9DB614B9F8B1005E8EBD /* libOnDiskPt.a */; };
- 1EBC53E7164C4B1400ADFA2C /* libsearch.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EBC53BD164C4AC300ADFA2C /* libsearch.a */; };
- 1EF0719F14B9F1D40052152A /* IOWrapper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0718A14B9F1D40052152A /* IOWrapper.cpp */; };
- 1EF071A214B9F1D40052152A /* Main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0718E14B9F1D40052152A /* Main.cpp */; };
- 1EF071A414B9F1D40052152A /* mbr.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0719114B9F1D40052152A /* mbr.cpp */; };
- 1EF071A614B9F1D40052152A /* TranslationAnalysis.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0719414B9F1D40052152A /* TranslationAnalysis.cpp */; };
-/* End PBXBuildFile section */
-
-/* Begin PBXContainerItemProxy section */
- 1EAF9DAC14B9F8AD005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */;
- proxyType = 2;
- remoteGlobalIDString = D2AAC046055464E500DB518D;
- remoteInfo = moses;
- };
- 1EAF9DB514B9F8B1005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */;
- proxyType = 2;
- remoteGlobalIDString = D2AAC046055464E500DB518D;
- remoteInfo = OnDiskPt;
- };
- 1EAF9DC214B9F8BA005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */;
- proxyType = 2;
- remoteGlobalIDString = 1EE8C2E91476A48E002496F2;
- remoteInfo = lm;
- };
- 1EAF9DCB14B9F8D6005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */;
- proxyType = 1;
- remoteGlobalIDString = D2AAC045055464E500DB518D;
- remoteInfo = OnDiskPt;
- };
- 1EAF9DCD14B9F8D6005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */;
- proxyType = 1;
- remoteGlobalIDString = D2AAC045055464E500DB518D;
- remoteInfo = moses;
- };
- 1EAF9DCF14B9F8D6005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */;
- proxyType = 1;
- remoteGlobalIDString = 1EE8C2E81476A48E002496F2;
- remoteInfo = lm;
- };
- 1EBC53BC164C4AC300ADFA2C /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */;
- proxyType = 2;
- remoteGlobalIDString = 1EBC53AE164C4A6200ADFA2C;
- remoteInfo = search;
- };
- 1EBC53E5164C4AFC00ADFA2C /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */;
- proxyType = 1;
- remoteGlobalIDString = 1EBC53AD164C4A6200ADFA2C;
- remoteInfo = search;
- };
-/* End PBXContainerItemProxy section */
-
-/* Begin PBXCopyFilesBuildPhase section */
- 8DD76F690486A84900D96B5E /* CopyFiles */ = {
- isa = PBXCopyFilesBuildPhase;
- buildActionMask = 8;
- dstPath = /usr/share/man/man1/;
- dstSubfolderSpec = 0;
- files = (
- );
- runOnlyForDeploymentPostprocessing = 1;
- };
-/* End PBXCopyFilesBuildPhase section */
-
-/* Begin PBXFileReference section */
- 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = moses.xcodeproj; sourceTree = "<group>"; };
- 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = OnDiskPt.xcodeproj; sourceTree = "<group>"; };
- 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = lm.xcodeproj; sourceTree = "<group>"; };
- 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = search.xcodeproj; sourceTree = "<group>"; };
- 1EF0718A14B9F1D40052152A /* IOWrapper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = IOWrapper.cpp; path = "../../moses-chart-cmd/src/IOWrapper.cpp"; sourceTree = "<group>"; };
- 1EF0718B14B9F1D40052152A /* IOWrapper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = IOWrapper.h; path = "../../moses-chart-cmd/src/IOWrapper.h"; sourceTree = "<group>"; };
- 1EF0718E14B9F1D40052152A /* Main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Main.cpp; path = "../../moses-chart-cmd/src/Main.cpp"; sourceTree = "<group>"; };
- 1EF0718F14B9F1D40052152A /* Main.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Main.h; path = "../../moses-chart-cmd/src/Main.h"; sourceTree = "<group>"; };
- 1EF0719114B9F1D40052152A /* mbr.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = mbr.cpp; path = "../../moses-chart-cmd/src/mbr.cpp"; sourceTree = "<group>"; };
- 1EF0719214B9F1D40052152A /* mbr.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = mbr.h; path = "../../moses-chart-cmd/src/mbr.h"; sourceTree = "<group>"; };
- 1EF0719414B9F1D40052152A /* TranslationAnalysis.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = TranslationAnalysis.cpp; path = "../../moses-chart-cmd/src/TranslationAnalysis.cpp"; sourceTree = "<group>"; };
- 1EF0719514B9F1D40052152A /* TranslationAnalysis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = TranslationAnalysis.h; path = "../../moses-chart-cmd/src/TranslationAnalysis.h"; sourceTree = "<group>"; };
- 8DD76F6C0486A84900D96B5E /* moses-chart-cmd */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "moses-chart-cmd"; sourceTree = BUILT_PRODUCTS_DIR; };
-/* End PBXFileReference section */
-
-/* Begin PBXFrameworksBuildPhase section */
- 8DD76F660486A84900D96B5E /* Frameworks */ = {
- isa = PBXFrameworksBuildPhase;
- buildActionMask = 2147483647;
- files = (
- 1EBC53E7164C4B1400ADFA2C /* libsearch.a in Frameworks */,
- 1EAF9DC614B9F8CD005E8EBD /* liblm.a in Frameworks */,
- 1EAF9DC714B9F8CD005E8EBD /* libmoses.a in Frameworks */,
- 1EAF9DC814B9F8CD005E8EBD /* libOnDiskPt.a in Frameworks */,
- );
- runOnlyForDeploymentPostprocessing = 0;
- };
-/* End PBXFrameworksBuildPhase section */
-
-/* Begin PBXGroup section */
- 08FB7794FE84155DC02AAC07 /* moses-chart-cmd */ = {
- isa = PBXGroup;
- children = (
- 08FB7795FE84155DC02AAC07 /* Source */,
- C6859E8C029090F304C91782 /* Documentation */,
- 1AB674ADFE9D54B511CA2CBB /* Products */,
- 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */,
- 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */,
- 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */,
- 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */,
- );
- name = "moses-chart-cmd";
- sourceTree = "<group>";
- };
- 08FB7795FE84155DC02AAC07 /* Source */ = {
- isa = PBXGroup;
- children = (
- 1EF0718A14B9F1D40052152A /* IOWrapper.cpp */,
- 1EF0718B14B9F1D40052152A /* IOWrapper.h */,
- 1EF0718E14B9F1D40052152A /* Main.cpp */,
- 1EF0718F14B9F1D40052152A /* Main.h */,
- 1EF0719114B9F1D40052152A /* mbr.cpp */,
- 1EF0719214B9F1D40052152A /* mbr.h */,
- 1EF0719414B9F1D40052152A /* TranslationAnalysis.cpp */,
- 1EF0719514B9F1D40052152A /* TranslationAnalysis.h */,
- );
- name = Source;
- sourceTree = "<group>";
- };
- 1AB674ADFE9D54B511CA2CBB /* Products */ = {
- isa = PBXGroup;
- children = (
- 8DD76F6C0486A84900D96B5E /* moses-chart-cmd */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- 1EAF9DA614B9F8AD005E8EBD /* Products */ = {
- isa = PBXGroup;
- children = (
- 1EAF9DAD14B9F8AD005E8EBD /* libmoses.a */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- 1EAF9DAF14B9F8B1005E8EBD /* Products */ = {
- isa = PBXGroup;
- children = (
- 1EAF9DB614B9F8B1005E8EBD /* libOnDiskPt.a */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- 1EAF9DB814B9F8B9005E8EBD /* Products */ = {
- isa = PBXGroup;
- children = (
- 1EAF9DC314B9F8BA005E8EBD /* liblm.a */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- 1EBC53B6164C4AC300ADFA2C /* Products */ = {
- isa = PBXGroup;
- children = (
- 1EBC53BD164C4AC300ADFA2C /* libsearch.a */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- C6859E8C029090F304C91782 /* Documentation */ = {
- isa = PBXGroup;
- children = (
- );
- name = Documentation;
- sourceTree = "<group>";
- };
-/* End PBXGroup section */
-
-/* Begin PBXNativeTarget section */
- 8DD76F620486A84900D96B5E /* moses-chart-cmd */ = {
- isa = PBXNativeTarget;
- buildConfigurationList = 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "moses-chart-cmd" */;
- buildPhases = (
- 8DD76F640486A84900D96B5E /* Sources */,
- 8DD76F660486A84900D96B5E /* Frameworks */,
- 8DD76F690486A84900D96B5E /* CopyFiles */,
- );
- buildRules = (
- );
- dependencies = (
- 1EBC53E6164C4AFC00ADFA2C /* PBXTargetDependency */,
- 1EAF9DCC14B9F8D6005E8EBD /* PBXTargetDependency */,
- 1EAF9DCE14B9F8D6005E8EBD /* PBXTargetDependency */,
- 1EAF9DD014B9F8D6005E8EBD /* PBXTargetDependency */,
- );
- name = "moses-chart-cmd";
- productInstallPath = "$(HOME)/bin";
- productName = "moses-chart-cmd";
- productReference = 8DD76F6C0486A84900D96B5E /* moses-chart-cmd */;
- productType = "com.apple.product-type.tool";
- };
-/* End PBXNativeTarget section */
-
-/* Begin PBXProject section */
- 08FB7793FE84155DC02AAC07 /* Project object */ = {
- isa = PBXProject;
- buildConfigurationList = 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "moses-chart-cmd" */;
- compatibilityVersion = "Xcode 3.1";
- developmentRegion = English;
- hasScannedForEncodings = 1;
- knownRegions = (
- English,
- Japanese,
- French,
- German,
- );
- mainGroup = 08FB7794FE84155DC02AAC07 /* moses-chart-cmd */;
- projectDirPath = "";
- projectReferences = (
- {
- ProductGroup = 1EAF9DB814B9F8B9005E8EBD /* Products */;
- ProjectRef = 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */;
- },
- {
- ProductGroup = 1EAF9DA614B9F8AD005E8EBD /* Products */;
- ProjectRef = 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */;
- },
- {
- ProductGroup = 1EAF9DAF14B9F8B1005E8EBD /* Products */;
- ProjectRef = 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */;
- },
- {
- ProductGroup = 1EBC53B6164C4AC300ADFA2C /* Products */;
- ProjectRef = 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */;
- },
- );
- projectRoot = "";
- targets = (
- 8DD76F620486A84900D96B5E /* moses-chart-cmd */,
- );
- };
-/* End PBXProject section */
-
-/* Begin PBXReferenceProxy section */
- 1EAF9DAD14B9F8AD005E8EBD /* libmoses.a */ = {
- isa = PBXReferenceProxy;
- fileType = archive.ar;
- path = libmoses.a;
- remoteRef = 1EAF9DAC14B9F8AD005E8EBD /* PBXContainerItemProxy */;
- sourceTree = BUILT_PRODUCTS_DIR;
- };
- 1EAF9DB614B9F8B1005E8EBD /* libOnDiskPt.a */ = {
- isa = PBXReferenceProxy;
- fileType = archive.ar;
- path = libOnDiskPt.a;
- remoteRef = 1EAF9DB514B9F8B1005E8EBD /* PBXContainerItemProxy */;
- sourceTree = BUILT_PRODUCTS_DIR;
- };
- 1EAF9DC314B9F8BA005E8EBD /* liblm.a */ = {
- isa = PBXReferenceProxy;
- fileType = archive.ar;
- path = liblm.a;
- remoteRef = 1EAF9DC214B9F8BA005E8EBD /* PBXContainerItemProxy */;
- sourceTree = BUILT_PRODUCTS_DIR;
- };
- 1EBC53BD164C4AC300ADFA2C /* libsearch.a */ = {
- isa = PBXReferenceProxy;
- fileType = archive.ar;
- path = libsearch.a;
- remoteRef = 1EBC53BC164C4AC300ADFA2C /* PBXContainerItemProxy */;
- sourceTree = BUILT_PRODUCTS_DIR;
- };
-/* End PBXReferenceProxy section */
-
-/* Begin PBXSourcesBuildPhase section */
- 8DD76F640486A84900D96B5E /* Sources */ = {
- isa = PBXSourcesBuildPhase;
- buildActionMask = 2147483647;
- files = (
- 1EF0719F14B9F1D40052152A /* IOWrapper.cpp in Sources */,
- 1EF071A214B9F1D40052152A /* Main.cpp in Sources */,
- 1EF071A414B9F1D40052152A /* mbr.cpp in Sources */,
- 1EF071A614B9F1D40052152A /* TranslationAnalysis.cpp in Sources */,
- );
- runOnlyForDeploymentPostprocessing = 0;
- };
-/* End PBXSourcesBuildPhase section */
-
-/* Begin PBXTargetDependency section */
- 1EAF9DCC14B9F8D6005E8EBD /* PBXTargetDependency */ = {
- isa = PBXTargetDependency;
- name = OnDiskPt;
- targetProxy = 1EAF9DCB14B9F8D6005E8EBD /* PBXContainerItemProxy */;
- };
- 1EAF9DCE14B9F8D6005E8EBD /* PBXTargetDependency */ = {
- isa = PBXTargetDependency;
- name = moses;
- targetProxy = 1EAF9DCD14B9F8D6005E8EBD /* PBXContainerItemProxy */;
- };
- 1EAF9DD014B9F8D6005E8EBD /* PBXTargetDependency */ = {
- isa = PBXTargetDependency;
- name = lm;
- targetProxy = 1EAF9DCF14B9F8D6005E8EBD /* PBXContainerItemProxy */;
- };
- 1EBC53E6164C4AFC00ADFA2C /* PBXTargetDependency */ = {
- isa = PBXTargetDependency;
- name = search;
- targetProxy = 1EBC53E5164C4AFC00ADFA2C /* PBXContainerItemProxy */;
- };
-/* End PBXTargetDependency section */
-
-/* Begin XCBuildConfiguration section */
- 1DEB923208733DC60010E9CD /* Debug */ = {
- isa = XCBuildConfiguration;
- buildSettings = {
- ALWAYS_SEARCH_USER_PATHS = NO;
- ARCHS = "$(ARCHS_STANDARD_64_BIT)";
- COPY_PHASE_STRIP = NO;
- GCC_DYNAMIC_NO_PIC = NO;
- GCC_ENABLE_FIX_AND_CONTINUE = YES;
- GCC_MODEL_TUNING = G5;
- GCC_OPTIMIZATION_LEVEL = 0;
- GCC_PREPROCESSOR_DEFINITIONS = (
- TRACE_ENABLE,
- _LARGE_FILES,
- "_FILE_OFFSET_BITS=64",
- "MAX_NUM_FACTORS=4",
- );
- HEADER_SEARCH_PATHS = /opt/local/include;
- INSTALL_PATH = /usr/local/bin;
- LIBRARY_SEARCH_PATHS = (
- ../../irstlm/lib,
- ../../srilm/lib/macosx,
- /opt/local/lib,
- ../../cmph/lib,
- );
- OTHER_LDFLAGS = (
- "-lz",
- "-lirstlm",
- "-lmisc",
- "-ldstruct",
- "-loolm",
- "-lflm",
- "-llattice",
- "-lboost_thread-mt",
- "-lboost_filesystem-mt",
- "-lboost_system-mt",
- "-lcmph",
- );
- PRODUCT_NAME = "moses-chart-cmd";
- SDKROOT = "";
- USER_HEADER_SEARCH_PATHS = "../../ ../../moses/src";
- };
- name = Debug;
- };
- 1DEB923308733DC60010E9CD /* Release */ = {
- isa = XCBuildConfiguration;
- buildSettings = {
- ALWAYS_SEARCH_USER_PATHS = NO;
- ARCHS = "$(ARCHS_STANDARD_64_BIT)";
- DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
- GCC_MODEL_TUNING = G5;
- GCC_PREPROCESSOR_DEFINITIONS = (
- TRACE_ENABLE,
- _LARGE_FILES,
- "_FILE_OFFSET_BITS=64",
- "MAX_NUM_FACTORS=4",
- );
- HEADER_SEARCH_PATHS = /opt/local/include;
- INSTALL_PATH = /usr/local/bin;
- LIBRARY_SEARCH_PATHS = (
- ../../irstlm/lib,
- ../../srilm/lib/macosx,
- /opt/local/lib,
- ../../cmph/lib,
- );
- OTHER_LDFLAGS = (
- "-lz",
- "-lirstlm",
- "-lmisc",
- "-ldstruct",
- "-loolm",
- "-lflm",
- "-llattice",
- "-lboost_thread-mt",
- "-lboost_filesystem-mt",
- "-lboost_system-mt",
- "-lcmph",
- );
- PRODUCT_NAME = "moses-chart-cmd";
- SDKROOT = "";
- USER_HEADER_SEARCH_PATHS = "../../ ../../moses/src";
- };
- name = Release;
- };
- 1DEB923608733DC60010E9CD /* Debug */ = {
- isa = XCBuildConfiguration;
- buildSettings = {
- ARCHS = "$(ARCHS_STANDARD_64_BIT)";
- GCC_C_LANGUAGE_STANDARD = gnu99;
- GCC_OPTIMIZATION_LEVEL = 0;
- GCC_PREPROCESSOR_DEFINITIONS = (
- TRACE_ENABLE,
- WITH_THREADS,
- );
- GCC_WARN_ABOUT_RETURN_TYPE = YES;
- GCC_WARN_UNUSED_VARIABLE = YES;
- HEADER_SEARCH_PATHS = (
- ../../moses/src,
- ../..,
- "/Users/hieuhoang/workspace/github/moses-smt/moses/src/**",
- );
- ONLY_ACTIVE_ARCH = YES;
- PREBINDING = NO;
- SDKROOT = "";
- };
- name = Debug;
- };
- 1DEB923708733DC60010E9CD /* Release */ = {
- isa = XCBuildConfiguration;
- buildSettings = {
- ARCHS = "$(ARCHS_STANDARD_64_BIT)";
- GCC_C_LANGUAGE_STANDARD = gnu99;
- GCC_PREPROCESSOR_DEFINITIONS = (
- TRACE_ENABLE,
- WITH_THREADS,
- );
- GCC_WARN_ABOUT_RETURN_TYPE = YES;
- GCC_WARN_UNUSED_VARIABLE = YES;
- HEADER_SEARCH_PATHS = (
- ../../moses/src,
- ../..,
- "/Users/hieuhoang/workspace/github/moses-smt/moses/src/**",
- );
- ONLY_ACTIVE_ARCH = YES;
- PREBINDING = NO;
- SDKROOT = "";
- };
- name = Release;
- };
-/* End XCBuildConfiguration section */
-
-/* Begin XCConfigurationList section */
- 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "moses-chart-cmd" */ = {
- isa = XCConfigurationList;
- buildConfigurations = (
- 1DEB923208733DC60010E9CD /* Debug */,
- 1DEB923308733DC60010E9CD /* Release */,
- );
- defaultConfigurationIsVisible = 0;
- defaultConfigurationName = Release;
- };
- 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "moses-chart-cmd" */ = {
- isa = XCConfigurationList;
- buildConfigurations = (
- 1DEB923608733DC60010E9CD /* Debug */,
- 1DEB923708733DC60010E9CD /* Release */,
- );
- defaultConfigurationIsVisible = 0;
- defaultConfigurationName = Release;
- };
-/* End XCConfigurationList section */
- };
- rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
-}
diff --git a/contrib/other-builds/moses-chart-cmd/.project b/contrib/other-builds/moses-chart-cmd/.project
deleted file mode 100644
index e59b8b8f7..000000000
--- a/contrib/other-builds/moses-chart-cmd/.project
+++ /dev/null
@@ -1,125 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
- <name>moses-chart-cmd</name>
- <comment></comment>
- <projects>
- <project>lm</project>
- <project>moses</project>
- <project>OnDiskPt</project>
- <project>search</project>
- <project>util</project>
- </projects>
- <buildSpec>
- <buildCommand>
- <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
- <triggers>clean,full,incremental,</triggers>
- <arguments>
- <dictionary>
- <key>?name?</key>
- <value></value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.append_environment</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.autoBuildTarget</key>
- <value>all</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildArguments</key>
- <value>-j3</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildCommand</key>
- <value>make</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildLocation</key>
- <value>${workspace_loc:/moses-chart-cmd/Debug}</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
- <value>clean</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.contents</key>
- <value>org.eclipse.cdt.make.core.activeConfigSettings</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableAutoBuild</key>
- <value>false</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableCleanBuild</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableFullBuild</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.fullBuildTarget</key>
- <value>all</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.stopOnError</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
- <value>true</value>
- </dictionary>
- </arguments>
- </buildCommand>
- <buildCommand>
- <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
- <triggers>full,incremental,</triggers>
- <arguments>
- </arguments>
- </buildCommand>
- </buildSpec>
- <natures>
- <nature>org.eclipse.cdt.core.cnature</nature>
- <nature>org.eclipse.cdt.core.ccnature</nature>
- <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
- <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
- </natures>
- <linkedResources>
- <link>
- <name>IOWrapper.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/IOWrapper.cpp</locationURI>
- </link>
- <link>
- <name>IOWrapper.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/IOWrapper.h</locationURI>
- </link>
- <link>
- <name>Jamfile</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/Jamfile</locationURI>
- </link>
- <link>
- <name>Main.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/Main.cpp</locationURI>
- </link>
- <link>
- <name>Main.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/Main.h</locationURI>
- </link>
- <link>
- <name>mbr.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/mbr.cpp</locationURI>
- </link>
- <link>
- <name>mbr.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/mbr.h</locationURI>
- </link>
- </linkedResources>
-</projectDescription>
diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject
index 6cb1d0ef7..6ed3d4818 100644
--- a/contrib/other-builds/moses-cmd/.cproject
+++ b/contrib/other-builds/moses-cmd/.cproject
@@ -5,13 +5,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.461114338" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -69,6 +69,7 @@
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="dl"/>
+ <listOptionValue builtIn="false" value="rt"/>
</option>
<option id="gnu.cpp.link.option.userobjs.1542590830" name="Other objects" superClass="gnu.cpp.link.option.userobjs"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.983725033" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
@@ -92,13 +93,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.2121690436" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
diff --git a/contrib/other-builds/moses-cmd/.project b/contrib/other-builds/moses-cmd/.project
index 52e2accfd..312c61654 100644
--- a/contrib/other-builds/moses-cmd/.project
+++ b/contrib/other-builds/moses-cmd/.project
@@ -87,16 +87,6 @@
</natures>
<linkedResources>
<link>
- <name>IOWrapper.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/IOWrapper.cpp</locationURI>
- </link>
- <link>
- <name>IOWrapper.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/IOWrapper.h</locationURI>
- </link>
- <link>
<name>Jamfile</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses-cmd/Jamfile</locationURI>
@@ -116,15 +106,5 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses-cmd/Main.h</locationURI>
</link>
- <link>
- <name>mbr.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/mbr.cpp</locationURI>
- </link>
- <link>
- <name>mbr.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/mbr.h</locationURI>
- </link>
</linkedResources>
</projectDescription>
diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject
index 3599de639..80ef75430 100644
--- a/contrib/other-builds/moses/.cproject
+++ b/contrib/other-builds/moses/.cproject
@@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -74,6 +74,22 @@
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.511477442" name="Rand.h" rcbsApplicability="disable" resourcePath="LM/Rand.h" toolsToInvoke=""/>
+<<<<<<< HEAD
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1176518033" name="/" resourcePath="LM/bilingual-lm">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.2110557759" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug" unusedChildren="">
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.2066996463" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base.1976472988"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.293388458" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1851071294" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.2126314903"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.569336804" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1168585173"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.690097122" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.2074660557"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.435042440" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug.933467113"/>
+ </toolChain>
+ </folderInfo>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.790052015" name="IRST.h" rcbsApplicability="disable" resourcePath="LM/IRST.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1211280539" name="DALMWrapper.h" rcbsApplicability="disable" resourcePath="LM/DALMWrapper.h" toolsToInvoke=""/>
+ <sourceEntries>
+ <entry excluding="LM/bilingual-lm|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/symal2mam.cc|TranslationModel/UG/mm/mtt-dump.cc|TranslationModel/UG/mm/mtt-count-words.cc|TranslationModel/UG/mm/mtt-build.cc|TranslationModel/UG/mm/mmlex-lookup.cc|TranslationModel/UG/mm/mmlex-build.cc|TranslationModel/UG/mm/mam_verify.cc|TranslationModel/UG/mm/mam2symal.cc|TranslationModel/UG/mm/custom-pt.cc|TranslationModel/UG/mm/calc-coverage.cc|TranslationModel/UG/mm/mtt.count.cc|TranslationModel/UG/util|LM/oxlm|LM/Rand.h|LM/Rand.cpp|TranslationModel/CompactPT|LM/NeuralLMWrapper.cpp|FF/PhraseLengthFeatureTest.cpp|PhraseLengthFeatureTest.cpp|LM/BackwardTest.cpp|LM/BackwardLMState.h|LM/BackwardLMState.cpp|LM/Backward.h|LM/Backward.cpp|FeatureVectorTest.cpp|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|src/SyntacticLanguageModelState.h|src/SyntacticLanguageModelFiles.h|src/SyntacticLanguageModel.h|src/SyntacticLanguageModel.cpp|src/LM/SRI.h|src/LM/SRI.cpp|src/LM/Rand.h|src/LM/Rand.cpp|src/LM/LDHT.h|src/LM/LDHT.cpp|SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/LDHT.h|LM/LDHT.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+=======
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1211280539" name="DALMWrapper.h" rcbsApplicability="disable" resourcePath="LM/DALMWrapper.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.790052015" name="IRST.h" rcbsApplicability="disable" resourcePath="LM/IRST.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1845526535" name="SRI.h" rcbsApplicability="disable" resourcePath="LM/SRI.h" toolsToInvoke=""/>
@@ -141,7 +157,23 @@
</toolChain>
</folderInfo>
<sourceEntries>
- <entry excluding="LM/IRST.h|LM/IRST.cpp|LM/SRI.h|LM/SRI.cpp|TranslationModel/UG|LM/DALMWrapper.h|LM/DALMWrapper.cpp|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/symal2mam.cc|TranslationModel/UG/mm/mtt-dump.cc|TranslationModel/UG/mm/mtt-count-words.cc|TranslationModel/UG/mm/mtt-build.cc|TranslationModel/UG/mm/mmlex-lookup.cc|TranslationModel/UG/mm/mmlex-build.cc|TranslationModel/UG/mm/mam_verify.cc|TranslationModel/UG/mm/mam2symal.cc|TranslationModel/UG/mm/custom-pt.cc|TranslationModel/UG/mm/calc-coverage.cc|TranslationModel/UG/mm/mtt.count.cc|TranslationModel/UG/util|LM/oxlm|LM/Rand.h|LM/Rand.cpp|TranslationModel/CompactPT|LM/NeuralLMWrapper.cpp|FF/PhraseLengthFeatureTest.cpp|PhraseLengthFeatureTest.cpp|LM/BackwardTest.cpp|LM/BackwardLMState.h|LM/BackwardLMState.cpp|LM/Backward.h|LM/Backward.cpp|FeatureVectorTest.cpp|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|src/SyntacticLanguageModelState.h|src/SyntacticLanguageModelFiles.h|src/SyntacticLanguageModel.h|src/SyntacticLanguageModel.cpp|src/LM/SRI.h|src/LM/SRI.cpp|src/LM/Rand.h|src/LM/Rand.cpp|src/LM/LDHT.h|src/LM/LDHT.cpp|SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/LDHT.h|LM/LDHT.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+ <entry excluding="LM/SRI.h|LM/SRI.cpp|TranslationModel/UG|LM/DALMWrapper.h|LM/DALMWrapper.cpp|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/symal2mam.cc|TranslationModel/UG/mm/mtt-dump.cc|TranslationModel/UG/mm/mtt-count-words.cc|TranslationModel/UG/mm/mtt-build.cc|TranslationModel/UG/mm/mmlex-lookup.cc|TranslationModel/UG/mm/mmlex-build.cc|TranslationModel/UG/mm/mam_verify.cc|TranslationModel/UG/mm/mam2symal.cc|TranslationModel/UG/mm/custom-pt.cc|TranslationModel/UG/mm/calc-coverage.cc|TranslationModel/UG/mm/mtt.count.cc|TranslationModel/UG/util|LM/oxlm|LM/Rand.h|LM/Rand.cpp|TranslationModel/CompactPT|LM/NeuralLMWrapper.cpp|FF/PhraseLengthFeatureTest.cpp|PhraseLengthFeatureTest.cpp|LM/BackwardTest.cpp|LM/BackwardLMState.h|LM/BackwardLMState.cpp|LM/Backward.h|LM/Backward.cpp|FeatureVectorTest.cpp|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|src/SyntacticLanguageModelState.h|src/SyntacticLanguageModelFiles.h|src/SyntacticLanguageModel.h|src/SyntacticLanguageModel.cpp|src/LM/SRI.h|src/LM/SRI.cpp|src/LM/Rand.h|src/LM/Rand.cpp|src/LM/LDHT.h|src/LM/LDHT.cpp|SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/LDHT.h|LM/LDHT.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+>>>>>>> master
+ </sourceEntries>
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1176518033" name="/" resourcePath="LM/bilingual-lm">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.2110557759" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug" unusedChildren="">
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.2066996463" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base.1976472988"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.293388458" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1851071294" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.2126314903"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.569336804" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1168585173"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.690097122" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.2074660557"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.435042440" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug.933467113"/>
+ </toolChain>
+ </folderInfo>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.790052015" name="IRST.h" rcbsApplicability="disable" resourcePath="LM/IRST.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1211280539" name="DALMWrapper.h" rcbsApplicability="disable" resourcePath="LM/DALMWrapper.h" toolsToInvoke=""/>
+ <sourceEntries>
+ <entry excluding="LM/bilingual-lm|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/symal2mam.cc|TranslationModel/UG/mm/mtt-dump.cc|TranslationModel/UG/mm/mtt-count-words.cc|TranslationModel/UG/mm/mtt-build.cc|TranslationModel/UG/mm/mmlex-lookup.cc|TranslationModel/UG/mm/mmlex-build.cc|TranslationModel/UG/mm/mam_verify.cc|TranslationModel/UG/mm/mam2symal.cc|TranslationModel/UG/mm/custom-pt.cc|TranslationModel/UG/mm/calc-coverage.cc|TranslationModel/UG/mm/mtt.count.cc|TranslationModel/UG/util|LM/oxlm|LM/Rand.h|LM/Rand.cpp|TranslationModel/CompactPT|LM/NeuralLMWrapper.cpp|FF/PhraseLengthFeatureTest.cpp|PhraseLengthFeatureTest.cpp|LM/BackwardTest.cpp|LM/BackwardLMState.h|LM/BackwardLMState.cpp|LM/Backward.h|LM/Backward.cpp|FeatureVectorTest.cpp|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|src/SyntacticLanguageModelState.h|src/SyntacticLanguageModelFiles.h|src/SyntacticLanguageModel.h|src/SyntacticLanguageModel.cpp|src/LM/SRI.h|src/LM/SRI.cpp|src/LM/Rand.h|src/LM/Rand.cpp|src/LM/LDHT.h|src/LM/LDHT.cpp|SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/LDHT.h|LM/LDHT.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
@@ -151,13 +183,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.401150096" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -203,12 +235,12 @@
</externalSetting>
</externalSettings>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -248,7 +280,7 @@
<listOptionValue builtIn="false" value="LM_IRST"/>
<listOptionValue builtIn="false" value="LM_DALM"/>
<listOptionValue builtIn="false" value="LM_NPLM"/>
- <listOptionValue builtIn="false" value="LM_LBL"/>
+ <listOptionValue builtIn="false" value="LM_OXLM"/>
<listOptionValue builtIn="false" value="_FILE_OFFSET_BIT=64"/>
<listOptionValue builtIn="false" value="_LARGE_FILES"/>
</option>
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project
index 2c8699956..ef2a032bd 100644
--- a/contrib/other-builds/moses/.project
+++ b/contrib/other-builds/moses/.project
@@ -432,6 +432,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/HypothesisStackNormal.h</locationURI>
</link>
<link>
+ <name>IOWrapper.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/IOWrapper.cpp</locationURI>
+ </link>
+ <link>
+ <name>IOWrapper.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/IOWrapper.h</locationURI>
+ </link>
+ <link>
<name>Incremental.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Incremental.cpp</locationURI>
@@ -777,6 +787,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/SyntacticLanguageModelState.h</locationURI>
</link>
<link>
+ <name>Syntax</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>TargetPhrase.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TargetPhrase.cpp</locationURI>
@@ -897,6 +912,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationOptionList.h</locationURI>
</link>
<link>
+ <name>TranslationTask.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationTask.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationTask.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationTask.h</locationURI>
+ </link>
+ <link>
<name>TreeInput.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TreeInput.cpp</locationURI>
@@ -1027,6 +1052,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/hypergraph.proto</locationURI>
</link>
<link>
+ <name>mbr.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/mbr.cpp</locationURI>
+ </link>
+ <link>
+ <name>mbr.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/mbr.h</locationURI>
+ </link>
+ <link>
<name>rule.proto</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/rule.proto</locationURI>
@@ -1501,6 +1536,11 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
+ <link>
+ <name>FF/bilingual-lm</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
<link>
<name>LM/Backward.cpp</name>
<type>1</type>
@@ -1707,6 +1747,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/backward.arpa</locationURI>
</link>
<link>
+ <name>LM/bilingual-lm</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>LM/oxlm</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
@@ -1787,6 +1832,126 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TreeStructurePhraseProperty.h</locationURI>
</link>
<link>
+ <name>Syntax/BoundedPriorityContainer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/BoundedPriorityContainer.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/Cube.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/Cube.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/Cube.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/Cube.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/CubeQueue.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/CubeQueue.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/CubeQueue.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/CubeQueue.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/KBestExtractor.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/KBestExtractor.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/KBestExtractor.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/KBestExtractor.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/NonTerminalMap.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/NonTerminalMap.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/PHyperedge.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/PHyperedge.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/PVertex.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/PVertex.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/RuleTable.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/RuleTable.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/RuleTableFF.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/RuleTableFF.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/RuleTableFF.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/RuleTableFF.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SHyperedge.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SHyperedge.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SHyperedge.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SHyperedge.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SHyperedgeBundle.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SHyperedgeBundle.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SHyperedgeBundleScorer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SHyperedgeBundleScorer.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SVertex.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SVertex.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SVertex.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SVertex.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SVertexBeam.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SVertexBeam.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SVertexRecombinationOrderer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SVertexRecombinationOrderer.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SymbolEqualityPred.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SymbolEqualityPred.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SymbolHasher.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SymbolHasher.h</locationURI>
+ </link>
+ <link>
<name>TranslationModel/BilingualDynSuffixArray.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/BilingualDynSuffixArray.cpp</locationURI>
@@ -2062,34 +2227,159 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/osmHyp.h</locationURI>
</link>
<link>
+ <name>LM/bilingual-lm/BilingualLM.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-1-ECLIPSE_HOME/github/mosesdecoder/moses/LM/bilingual-lm/BilingualLM.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/bilingual-lm/BilingualLM.h</name>
+ <type>1</type>
+ <locationURI>PARENT-1-ECLIPSE_HOME/github/mosesdecoder/moses/LM/bilingual-lm/BilingualLM.h</locationURI>
+ </link>
+ <link>
<name>FF/extract-ghkm/PhraseOrientation.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.cpp</locationURI>
+ <locationURI>PARENT-1-ECLIPSE_HOME/github/mosesdecoder/moses/FF/extract-ghkm/PhraseOrientation.cpp</locationURI>
</link>
<link>
<name>FF/extract-ghkm/PhraseOrientation.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.h</locationURI>
+ <locationURI>PARENT-1-ECLIPSE_HOME/github/mosesdecoder/moses/FF/extract-ghkm/PhraseOrientation.h</locationURI>
</link>
<link>
- <name>LM/oxlm/LBLLM.cpp</name>
+ <name>LM/oxlm/OxLM.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/LBLLM.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLM.cpp</locationURI>
</link>
<link>
- <name>LM/oxlm/LBLLM.h</name>
+ <name>LM/oxlm/OxLM.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/LBLLM.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLM.h</locationURI>
</link>
<link>
- <name>LM/oxlm/Mapper.cpp</name>
+ <name>LM/oxlm/OxLMMapper.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/Mapper.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLMMapper.cpp</locationURI>
</link>
<link>
- <name>LM/oxlm/Mapper.h</name>
+ <name>LM/oxlm/OxLMMapper.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/Mapper.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLMMapper.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/DerivationWriter.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/DerivationWriter.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/DerivationWriter.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/DerivationWriter.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Manager-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Manager-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Manager.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Manager.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/OovHandler-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/OovHandler-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/OovHandler.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/OovHandler.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/PChart.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/PChart.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/PChart.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/PChart.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/PHyperedgeToSHyperedgeBundle.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/ParserCallback.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/ParserCallback.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrie.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrie.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieCYKPlus.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieCYKPlus.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieCYKPlus.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieCYKPlus.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieCreator.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieCreator.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieLoader.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieLoader.cpp</locationURI>
+=======
+ <name>LM/oxlm/OxLM.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLM.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/OxLM.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLM.h</locationURI>
+>>>>>>> Rename LBLLM -> OxLM.
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieLoader.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieLoader.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieScope3.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieScope3.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieScope3.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieScope3.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/SChart.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/SChart.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/SChart.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/SChart.h</locationURI>
</link>
<link>
<name>TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp</name>
@@ -2867,6 +3157,21 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/KenOSM.h</locationURI>
</link>
<link>
+ <name>Syntax/S2T/Parsers/Parser.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Parser.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/RecursiveCYKPlusParser</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>TranslationModel/CompactPT/bin/gcc-4.7</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
@@ -3237,6 +3542,76 @@
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
+ <name>Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/Parser.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/TailLattice.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h</locationURI>
+ </link>
+ <link>
<name>TranslationModel/CompactPT/bin/gcc-4.7/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
diff --git a/contrib/other-builds/score/.cproject b/contrib/other-builds/score/.cproject
index 15d939f3f..044fad896 100644
--- a/contrib/other-builds/score/.cproject
+++ b/contrib/other-builds/score/.cproject
@@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.852684782" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -18,7 +18,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.852684782." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.628760407" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.40031730" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/score}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1494414913" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/score}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1494414913" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1369030665" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1299858559" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1103483066" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -60,6 +60,7 @@
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="dl"/>
+ <listOptionValue builtIn="false" value="rt"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.202044854" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
@@ -79,12 +80,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1878418244" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/server/.cproject
index e244f8ac8..d971684d8 100644
--- a/contrib/other-builds/moses-chart-cmd/.cproject
+++ b/contrib/other-builds/server/.cproject
@@ -1,88 +1,90 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.162355801">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.162355801" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.1015532240">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1015532240" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.162355801" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
- <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
- <tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
- <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1584931166" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.65842083" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
- <listOptionValue builtIn="false" value="/opt/local/include/"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../cmph/include&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.1015532240" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1015532240." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1201298107" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.2097807873" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/server}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.857185882" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.142173353" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1657626940" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.269939241" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1769920565" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.649991225" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../xmlrpc-c/include&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost&quot;"/>
</option>
- <option id="gnu.cpp.compiler.option.preprocessor.def.1785368241" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
- <listOptionValue builtIn="false" value="HAVE_BOOST"/>
- <listOptionValue builtIn="false" value="TRACE_ENABLE"/>
- <listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/>
- <listOptionValue builtIn="false" value="WITH_THREADS"/>
+ <option id="gnu.cpp.compiler.option.preprocessor.def.2063944336" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="MAX_NUM_FACTORS=4"/>
+ <listOptionValue builtIn="false" value="WITH_THREADS"/>
+ <listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1402496521" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.603240279" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.827478809" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.1840610682" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.exe.debug.option.debugging.level.1437095112" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.128236233" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.165185265" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.502789927" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.1365428538" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.836267531" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.755343734" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
- <option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../nplm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../probingPT/helpers&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../cmph/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1867046221" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1443553047" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.paths.1096041402" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../xmlrpc-c/lib&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
- <listOptionValue builtIn="false" value="/opt/local/lib"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
- <option id="gnu.cpp.link.option.libs.1177721357" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <option id="gnu.cpp.link.option.libs.1087215166" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="moses"/>
<listOptionValue builtIn="false" value="search"/>
<listOptionValue builtIn="false" value="OnDiskPt"/>
<listOptionValue builtIn="false" value="lm"/>
<listOptionValue builtIn="false" value="util"/>
- <listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="xmlrpc_server_abyss++"/>
+ <listOptionValue builtIn="false" value="xmlrpc_server++"/>
+ <listOptionValue builtIn="false" value="xmlrpc_server_abyss"/>
+ <listOptionValue builtIn="false" value="xmlrpc_server"/>
+ <listOptionValue builtIn="false" value="xmlrpc_abyss"/>
+        <listOptionValue builtIn="false" value="xmlrpc++"/>
+ <listOptionValue builtIn="false" value="xmlrpc"/>
+ <listOptionValue builtIn="false" value="xmlrpc_util"/>
+ <listOptionValue builtIn="false" value="xmlrpc_xmlparse"/>
+ <listOptionValue builtIn="false" value="xmlrpc_xmltok"/>
+ <listOptionValue builtIn="false" value="pthread"/>
<listOptionValue builtIn="false" value="boost_serialization"/>
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
<listOptionValue builtIn="false" value="boost_system"/>
<listOptionValue builtIn="false" value="boost_thread"/>
<listOptionValue builtIn="false" value="boost_filesystem"/>
- <listOptionValue builtIn="false" value="pthread"/>
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="dl"/>
+ <listOptionValue builtIn="false" value="rt"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.128214028" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.308755092" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1267270542" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.612723114" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.784062133" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1514675611" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -90,45 +92,44 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.516628324">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.516628324" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.179761083">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.179761083" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.516628324" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.exe.release.516628324." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1782680519" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
- <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.release.587667692" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
- <builder buildPath="${workspace_loc:/moses-chart-cmd/Release}" id="cdt.managedbuild.target.gnu.builder.exe.release.330540300" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
- <tool id="cdt.managedbuild.tool.gnu.archiver.base.1062976385" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1344864210" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
- <option id="gnu.cpp.compiler.exe.release.option.optimization.level.1422341509" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.exe.release.option.debugging.level.1573362644" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1937178483" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.179761083" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.179761083." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.2024222442" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1098252145" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/server}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.24884855" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1561001393" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1260095073" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.824342210" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.620231073" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.372465520" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1116405938" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.32856289" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.exe.release.option.debugging.level.1235489953" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1583852187" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1635883096" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.74859509" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.1604502606" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.624155660" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.1007421110" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.195880914" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.518921609" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.727800742" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.1586891175" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1588265513" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.330494310" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1407747418" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.727000276" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.665044877" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -138,32 +139,31 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="moses-chart-cmd.cdt.managedbuild.target.gnu.exe.532411209" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
+ <project id="server.cdt.managedbuild.target.gnu.exe.580879474" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.516628324;cdt.managedbuild.config.gnu.exe.release.516628324.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1116405938;cdt.managedbuild.tool.gnu.c.compiler.input.1583852187">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.179761083;cdt.managedbuild.config.gnu.exe.release.179761083.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1635883096;cdt.managedbuild.tool.gnu.c.compiler.input.624155660">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.162355801;cdt.managedbuild.config.gnu.exe.debug.162355801.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.827478809;cdt.managedbuild.tool.gnu.c.compiler.input.128236233">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.179761083;cdt.managedbuild.config.gnu.exe.release.179761083.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1260095073;cdt.managedbuild.tool.gnu.cpp.compiler.input.372465520">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.162355801;cdt.managedbuild.config.gnu.exe.debug.162355801.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480;cdt.managedbuild.tool.gnu.cpp.compiler.input.1402496521">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1015532240;cdt.managedbuild.config.gnu.exe.debug.1015532240.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1657626940;cdt.managedbuild.tool.gnu.cpp.compiler.input.603240279">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.516628324;cdt.managedbuild.config.gnu.exe.release.516628324.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1344864210;cdt.managedbuild.tool.gnu.cpp.compiler.input.1937178483">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1015532240;cdt.managedbuild.config.gnu.exe.debug.1015532240.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.165185265;cdt.managedbuild.tool.gnu.c.compiler.input.836267531">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
- <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+ <resource resourceType="PROJECT" workspacePath="/server"/>
</configuration>
<configuration configurationName="Debug">
- <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+ <resource resourceType="PROJECT" workspacePath="/server"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
- <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>
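The Debug configuration above links mosesserver against the complete xmlrpc-c C++ stack, from the Abyss server wrappers down to the util/xmlparse/xmltok layers, in rough dependency order. For orientation, a minimal Abyss XML-RPC server built on these same libraries looks like the sketch below, using xmlrpc-c's stock C++ API; the method name and port are illustrative and not part of this commit:

#include <string>
#include <xmlrpc-c/base.hpp>
#include <xmlrpc-c/registry.hpp>
#include <xmlrpc-c/server_abyss.hpp>

// Illustrative RPC method: echoes its single string argument back.
class echoMethod : public xmlrpc_c::method {
public:
  void execute(xmlrpc_c::paramList const& params,
               xmlrpc_c::value* const retvalP) {
    std::string const s(params.getString(0));
    params.verifyEnd(1);
    *retvalP = xmlrpc_c::value_string(s);
  }
};

int main() {
  xmlrpc_c::registry registry;
  xmlrpc_c::methodPtr const echoP(new echoMethod);
  registry.addMethod("sample.echo", echoP);
  xmlrpc_c::serverAbyss server(xmlrpc_c::serverAbyss::constrOpt()
                               .registryP(&registry)
                               .portNumber(8080));
  server.run(); // serves until the process is killed
  return 0;
}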
diff --git a/contrib/other-builds/server/.project b/contrib/other-builds/server/.project
new file mode 100644
index 000000000..fd9ccc290
--- /dev/null
+++ b/contrib/other-builds/server/.project
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>server</name>
+ <comment></comment>
+ <projects>
+ <project>lm</project>
+ <project>moses</project>
+ <project>OnDiskPt</project>
+ <project>search</project>
+ <project>util</project>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+ <triggers>clean,full,incremental,</triggers>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ <buildCommand>
+ <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+ <triggers>full,incremental,</triggers>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.cdt.core.cnature</nature>
+ <nature>org.eclipse.cdt.core.ccnature</nature>
+ <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+ <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+ </natures>
+ <linkedResources>
+ <link>
+ <name>mosesserver.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-2-PROJECT_LOC/server/mosesserver.cpp</locationURI>
+ </link>
+ </linkedResources>
+</projectDescription>
diff --git a/contrib/rephraser/Jamfile b/contrib/rephraser/Jamfile
new file mode 100644
index 000000000..4d868ddf4
--- /dev/null
+++ b/contrib/rephraser/Jamfile
@@ -0,0 +1 @@
+exe paraphrase : paraphrase.cpp ../../moses//moses ../..//boost_program_options ;
diff --git a/contrib/rephraser/paraphrase.cpp b/contrib/rephraser/paraphrase.cpp
new file mode 100644
index 000000000..ad9dbc891
--- /dev/null
+++ b/contrib/rephraser/paraphrase.cpp
@@ -0,0 +1,148 @@
+// $Id$
+// vim:tabstop=2
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+
+/**
+ * Compute paraphrases from the phrase table
+**/
+#include <cmath>
+#include <iostream>
+#include <map>
+
+#include <boost/program_options.hpp>
+
+#include "util/double-conversion/double-conversion.h"
+#include "util/exception.hh"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+//using namespace Moses;
+using namespace std;
+
+namespace po = boost::program_options;
+
+typedef multimap<float,string> Probs;
+
+static float threshold = 1e-04;
+static size_t maxE = 10000; //histogram pruning
+
+static void add(const string& e, const vector<float>& scores,
+ Probs& p_e_given_f, Probs& p_f_given_e) {
+ if (scores[0] > threshold) {
+ p_f_given_e.insert(pair<float,string>(scores[0],e));
+ }
+ while(p_f_given_e.size() > maxE) p_f_given_e.erase(p_f_given_e.begin());
+ if (scores[2] > threshold) {
+ p_e_given_f.insert(pair<float,string>(scores[2],e));
+ }
+ while(p_e_given_f.size() > maxE) p_e_given_f.erase(p_e_given_f.begin());
+}
+
+static void finalise(Probs& p_e_given_f, Probs& p_f_given_e) {
+ //cerr << "Sizes: p(e|f): " << p_e_given_f.size() << " p(f|e): " << p_f_given_e.size() << endl;
+ for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
+ e1_iter != p_f_given_e.end(); ++e1_iter) {
+ for (Probs::const_iterator e2_iter = p_e_given_f.begin() ;
+ e2_iter != p_e_given_f.end(); ++e2_iter) {
+
+ if (e1_iter->second == e2_iter->second) continue;
+ cout << e1_iter->second << " ||| " << e2_iter->second << " ||| " <<
+ e1_iter->first * e2_iter->first << " ||| " << endl;
+ }
+ }
+ p_e_given_f.clear();
+ p_f_given_e.clear();
+}
+
+int main(int argc, char** argv) {
+
+ string input_file;
+
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("help,h", "Print help message and exit")
+ ("threshold,t", po::value<float>(&threshold), "Threshold for p(e|f) and p(f|e)")
+ ("max-target,m", po::value<size_t>(&maxE), "Maximum number of target phrases")
+ ("input-file", po::value<string>(&input_file)->required(), "Input phrase table")
+ ;
+
+ po::positional_options_description pos;
+ pos.add("input-file",1);
+
+ po::variables_map vm;
+ po::store(po::command_line_parser(argc,argv).options(desc).positional(pos).run(), vm);
+
+
+ if (vm.count("help")) {
+ cerr << "Usage: " << string(argv[0]) + " [options] input-file" << endl;
+ cerr << desc << endl;
+ return 0;
+ }
+
+ po::notify(vm);
+
+
+ cerr << "Reading from " << input_file << endl;
+ util::FilePiece in(input_file.c_str(), &std::cerr);
+ vector<float> scoreVector;
+ StringPiece line;
+ double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+
+
+ string previousSourcePhrase;
+ Probs p_f_given_e_table;
+ Probs p_e_given_f_table;
+
+ size_t count = 0;
+ while(true) {
+ try {
+ line = in.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+ ++count;
+
+ util::TokenIter<util::MultiCharacter> pipes(line, " ||| ");
+ StringPiece sourcePhrase(*pipes);
+ StringPiece targetPhrase(*++pipes);
+ StringPiece scoreString(*++pipes);
+ scoreVector.clear();
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+ int processed;
+ float score = converter.StringToFloat(s->data(), s->length(), &processed);
+ UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count);
+ scoreVector.push_back(score);
+ }
+
+ if (sourcePhrase.size() && sourcePhrase != previousSourcePhrase) {
+ finalise(p_e_given_f_table, p_f_given_e_table);
+ }
+ add(targetPhrase.as_string(),scoreVector, p_e_given_f_table, p_f_given_e_table);
+ previousSourcePhrase = sourcePhrase.as_string();
+ }
+ finalise(p_e_given_f_table, p_f_given_e_table);
+
+
+
+ return 0;
+}
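paraphrase.cpp reads the phrase table grouped by source phrase f, banks each target phrase e with its inverse score p(f|e) (scores[0] in the standard Moses score order) and its direct score p(e|f) (scores[2]), and in finalise() prints every pair (e1, e2) that shares the pivot f, scored e1_iter->first * e2_iter->first = p(f|e1) * p(e2|f). These are the per-pivot terms of the usual pivot paraphrase approximation,

$$ p(e_2 \mid e_1) \;\approx\; \sum_f p(e_2 \mid f)\, p(f \mid e_1), $$

except that the sum over pivots is not taken: a pair reachable through several source phrases is printed once per pivot, after the threshold (-t) and histogram (-m) pruning applied in add().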
diff --git a/contrib/server/Jamfile b/contrib/server/Jamfile
index 6ab2590ea..a18a31cb0 100644
--- a/contrib/server/Jamfile
+++ b/contrib/server/Jamfile
@@ -39,7 +39,7 @@ if $(build-moses-server) = true
xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --libs" ] ;
xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --cflags" ] ;
- exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt ../../moses-cmd/IOWrapper.cpp ../..//boost_filesystem : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
+ exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt ../..//boost_filesystem : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
} else {
alias mosesserver ;
}
diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index ae697b7c6..f20bab592 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -17,7 +17,7 @@
#endif
#include "moses/TreeInput.h"
#include "moses/LM/ORLM.h"
-#include "moses-cmd/IOWrapper.h"
+#include "moses/IOWrapper.h"
#ifdef WITH_THREADS
#include <boost/thread.hpp>
@@ -28,7 +28,6 @@
#include <xmlrpc-c/server_abyss.hpp>
using namespace Moses;
-using namespace MosesCmd;
using namespace std;
typedef std::map<std::string, xmlrpc_c::value> params_t;
@@ -281,7 +280,7 @@ public:
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
tinput.Read(in,inputFactorOrder);
- ChartManager manager(0,tinput);
+ ChartManager manager(tinput);
manager.ProcessSentence();
const ChartHypothesis *hypo = manager.GetBestHypothesis();
outputChartHypo(out,hypo);
@@ -292,13 +291,15 @@ public:
m_retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_string(sgstream.str())));
}
} else {
+ size_t lineNumber = 0; // TODO: Include sentence request number here?
Sentence sentence;
+ sentence.SetTranslationId(lineNumber);
+
const vector<FactorType> &
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
sentence.Read(in,inputFactorOrder);
- size_t lineNumber = 0; // TODO: Include sentence request number here?
- Manager manager(lineNumber, sentence, staticData.GetSearchAlgorithm());
+ Manager manager(sentence, staticData.GetSearchAlgorithm());
manager.ProcessSentence();
const Hypothesis* hypo = manager.GetBestHypothesis();
@@ -309,7 +310,7 @@ public:
}
if (addWordAlignInfo) {
stringstream wordAlignment;
- OutputAlignment(wordAlignment, hypo);
+ IOWrapper::OutputAlignment(wordAlignment, hypo);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
@@ -471,7 +472,7 @@ public:
if ((int)edges.size() > 0) {
stringstream wordAlignment;
- OutputAlignment(wordAlignment, edges[0]);
+ IOWrapper::OutputAlignment(wordAlignment, edges[0]);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
@@ -489,7 +490,7 @@ public:
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
- MosesCmd::OutputAllFeatureScores(path.GetScoreBreakdown(),buf);
+ IOWrapper::OutputAllFeatureScores(path.GetScoreBreakdown(),buf);
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
diff --git a/contrib/sigtest-filter/Makefile b/contrib/sigtest-filter/Makefile
index 71de9c45f..55772929a 100644
--- a/contrib/sigtest-filter/Makefile
+++ b/contrib/sigtest-filter/Makefile
@@ -7,4 +7,4 @@ all: filter-pt
filter-pt: filter-pt.cpp
./check-install $(SALMDIR)
- $(CXX) -O6 $(INC) $(OBJS) -o filter-pt filter-pt.cpp
+ $(CXX) -O6 $(INC) $(OBJS) -o filter-pt filter-pt.cpp -lboost_thread -lboost_system -lpthread -lrt
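The added libraries back the threaded rewrite of filter-pt.cpp in the next diff: -lboost_thread and -lboost_system for Boost.Thread, -lpthread for the underlying POSIX threads, and -lrt presumably for the clock symbols older glibc keeps in librt. The concurrency pattern is plain work sharing: each thread claims a large batch of input lines under one mutex, filters it with no locks held, then flushes its output as a single block under a second mutex. A minimal self-contained sketch of just that skeleton (illustrative names and batch size; the real worker is filter() below, with 500000-line batches):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <boost/thread.hpp>

boost::mutex in_mutex;   // guards reading from the shared input stream
boost::mutex out_mutex;  // guards writing to the shared output stream

// Each worker repeatedly claims a batch of lines, processes it privately,
// then emits the result in one locked write so batches stay contiguous.
void worker(std::istream* in, std::ostream* out) {
  std::vector<std::string> batch;
  while (true) {
    {
      boost::mutex::scoped_lock lock(in_mutex);
      if (in->eof()) break;
      batch.clear();
      std::string line;
      while (batch.size() < 1000 && std::getline(*in, line))
        batch.push_back(line);
    }
    std::ostringstream local;
    for (size_t i = 0; i < batch.size(); ++i)
      local << batch[i] << '\n';  // the real code filters each entry here
    boost::mutex::scoped_lock lock(out_mutex);
    *out << local.str() << std::flush;
  }
}

int main() {
  boost::thread_group pool;
  for (int i = 0; i < 4; ++i)
    pool.add_thread(new boost::thread(worker, &std::cin, &std::cout));
  pool.join_all();
  return 0;
}

Two properties of the real code worth noting: a run of phrase pairs sharing one source phrase can straddle a batch boundary and is then filtered in two independent pieces (the very large batch size makes this rare), and the shared counters (pt_lines, nremoved_pfefilter, nremoved_sigfilter) are incremented without a lock, which is tolerable for progress reporting but is technically a data race.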
diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp
index 6ab1a5657..bd0b9ae36 100644
--- a/contrib/sigtest-filter/filter-pt.cpp
+++ b/contrib/sigtest-filter/filter-pt.cpp
@@ -4,6 +4,8 @@
#include <cstdio>
#include <cstdlib>
#include <algorithm>
+#include <fstream>
+#include <sstream>
#include "_SuffixArraySearchApplicationBase.h"
@@ -11,18 +13,16 @@
#include <iostream>
#include <set>
+#include <boost/thread/tss.hpp>
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
+
#ifdef WIN32
#include "WIN32_functions.h"
#else
#include <unistd.h>
#endif
-typedef std::vector<TextLenType> SentIdSet;
-typedef std::pair<SentIdSet, clock_t> ClockedSentIdSet;
-typedef std::map<std::string, ClockedSentIdSet> PhraseSetMap;
-
-#undef min
-
// constants
const size_t MINIMUM_SIZE_TO_KEEP = 10000; // increase this to improve memory usage,
// reduce for speed
@@ -39,12 +39,9 @@ double sig_filter_limit = 0; // keep phrase pairs with -log(sig) > si
// higher = filter-more
bool pef_filter_only = false; // only filter based on pef
bool hierarchical = false;
-int max_cache = 0;
-// globals
-PhraseSetMap esets;
-PhraseSetMap fsets;
double p_111 = 0.0; // alpha
+size_t pt_lines = 0;
size_t nremoved_sigfilter = 0;
size_t nremoved_pfefilter = 0;
@@ -52,6 +49,69 @@ C_SuffixArraySearchApplicationBase e_sa;
C_SuffixArraySearchApplicationBase f_sa;
int num_lines;
+boost::mutex in_mutex;
+boost::mutex out_mutex;
+boost::mutex err_mutex;
+
+typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
+
+class Cache {
+ typedef std::pair<SentIdSet, clock_t> ClockedSet;
+ typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;
+
+ public:
+
+ SentIdSet get(const std::string& phrase) {
+ boost::shared_lock<boost::shared_mutex> lock(m_mutex);
+ if(m_cont.count(phrase)) {
+ ClockedSet& set = m_cont[phrase];
+ set.second = clock();
+ return set.first;
+ }
+ return SentIdSet( new SentIdSet::element_type() );
+ }
+
+ void put(const std::string& phrase, const SentIdSet set) {
+ boost::unique_lock<boost::shared_mutex> lock(m_mutex);
+ m_cont[phrase] = std::make_pair(set, clock());
+ }
+
+ static void set_max_cache(size_t max_cache) {
+ s_max_cache = max_cache;
+ }
+
+ void prune() {
+ if(s_max_cache > 0) {
+ boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
+ if(m_cont.size() > s_max_cache) {
+ std::vector<clock_t> clocks;
+ for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
+ clocks.push_back(it->second.second);
+
+ std::sort(clocks.begin(), clocks.end());
+ clock_t out = clocks[m_cont.size() - s_max_cache];
+
+ boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
+        for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); )
+          if(it->second.second < out)
+            m_cont.erase(it++); // post-increment: advance before erase invalidates the iterator
+          else
+            ++it;
+ }
+ }
+ }
+
+ private:
+ ClockedMap m_cont;
+ boost::shared_mutex m_mutex;
+ static size_t s_max_cache;
+};
+
+size_t Cache::s_max_cache = 0;
+
+Cache f_cache;
+Cache e_cache;
+
+#undef min
+
void usage()
{
std::cerr << "\nFilter phrase table using significance testing as described\n"
@@ -59,12 +119,13 @@ void usage()
<< "by Discarding Most of the Phrasetable. EMNLP 2007.\n"
<< "\nUsage:\n"
<< "\n filter-pt -e english.suf-arr -f french.suf-arr\n"
- << " [-c] [-p] [-l threshold] [-n num] < PHRASE-TABLE > FILTERED-PHRASE-TABLE\n\n"
+ << " [-c] [-p] [-l threshold] [-n num] [-t num] < PHRASE-TABLE > FILTERED-PHRASE-TABLE\n\n"
<< " [-l threshold] >0.0, a+e, or a-e: keep values that have a -log significance > this\n"
<< " [-n num ] 0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements\n"
<< "  [-c          ] add the cooccurrence counts to the phrase table\n"
<< " [-p ] add -log(significance) to the phrasetable\n"
<< " [-h ] filter hierarchical rule table\n"
+ << " [-t num ] use num threads\n"
<< " [-m num ] limit cache to num most recent phrases\n";
exit(1);
}
@@ -133,9 +194,6 @@ PTEntry::PTEntry(const std::string& str, int index) :
*fp++=0;
this->pfe = atof(f);
-
- // std::cerr << "L: " << f_phrase << " ::: " << e_phrase << " ::: " << scores << " ::: " << pfe << std::endl;
- // std::cerr << "X: " << extra << "\n";
}
struct PfeComparer {
@@ -168,7 +226,8 @@ std::ostream& operator << (std::ostream& os, const PTEntry& pp)
void print(int a, int b, int c, int d, float p)
{
std::cerr << a << "\t" << b << "\t P=" << p << "\n"
- << c << "\t" << d << "\t xf=" << (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1) << "\n\n";
+ << c << "\t" << d << "\t xf="
+ << (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1) << "\n\n";
}
// 2x2 (one-sided) Fisher's exact test
@@ -184,13 +243,13 @@ double fisher_exact(int cfe, int ce, int cf)
int d = (num_lines - ce - cf + cfe);
int n = a + b + c + d;
- double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d) - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c) - lgamma(1+d));
+ double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d)
+ - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c)
+ - lgamma(1+d));
double total_p = 0.0;
int tc = std::min(b,c);
for (int i=0; i<=tc; i++) {
total_p += cp;
-// double lg = lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d) - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c) - lgamma(1+d); double cp = exp(lg);
-// print(a,b,c,d,cp);
double coef = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
cp *= coef;
++a;
@@ -202,86 +261,73 @@ double fisher_exact(int cfe, int ce, int cf)
}
template <class setType>
-setType ordered_set_intersect(setType & set_1, setType & set_2)
+void ordered_set_intersect(setType& out, const setType set_1, const setType set_2)
{
- setType set_out;
- std::set_intersection(set_1.begin(), set_1.end(), set_2.begin(), set_2.end(), inserter(set_out,set_out.begin()) );
- return set_out;
+ std::set_intersection(set_1->begin(), set_1->end(), set_2->begin(),
+ set_2->end(), inserter(*out, out->begin()) );
}
-SentIdSet lookup_phrase(const std::string & phrase, C_SuffixArraySearchApplicationBase & my_sa)
+void lookup_phrase(SentIdSet& ids, const std::string& phrase,
+ C_SuffixArraySearchApplicationBase & my_sa, Cache& cache)
{
- SentIdSet occur_set;
- vector<S_SimplePhraseLocationElement> locations;
-
- locations = my_sa.locateExactPhraseInCorpus(phrase.c_str());
- if(locations.size()==0) {
- cerr<<"No occurrences found!!\n";
- }
- for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin(); i != locations.end(); ++i) {
- occur_set.push_back(i->sentIdInCorpus);
+ ids = cache.get(phrase);
+ if(ids->empty()) {
+ vector<S_SimplePhraseLocationElement> locations;
+ locations = my_sa.locateExactPhraseInCorpus(phrase.c_str());
+ if(locations.size()==0) {
+ cerr<<"No occurrences found!!\n";
+ }
+ for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin();
+ i != locations.end(); ++i) {
+ ids->push_back(i->sentIdInCorpus);
+ }
+
+ std::sort(ids->begin(), ids->end());
+ SentIdSet::element_type::iterator it =
+ std::unique(ids->begin(), ids->end());
+ ids->resize(it - ids->begin());
+
+ if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
+ cache.put(phrase, ids);
}
-
- std::sort(occur_set.begin(), occur_set.end());
- SentIdSet::iterator it = std::unique(occur_set.begin(), occur_set.end());
- occur_set.resize(it - occur_set.begin());
-
- return occur_set;
}
-
-// slight simplicifaction: we consider all sentences in which "a" and "b" occur to be instances of the rule "a [X][X] b".
-SentIdSet lookup_multiple_phrases(vector<std::string> & phrases, C_SuffixArraySearchApplicationBase & my_sa, const std::string & rule, PhraseSetMap & cache)
-{
+void lookup_multiple_phrases(SentIdSet& ids, vector<std::string> & phrases,
+ C_SuffixArraySearchApplicationBase & my_sa,
+ const std::string & rule, Cache& cache)
+{
if (phrases.size() == 1) {
- return lookup_phrase(phrases.front(), my_sa);
+ lookup_phrase(ids, phrases.front(), my_sa, cache);
}
-
else {
- SentIdSet main_set;
- ClockedSentIdSet & clocked_first_set = cache[phrases.front()];
- SentIdSet & first_set = clocked_first_set.first;
- clocked_first_set.second = clock();
-
+ SentIdSet main_set( new SentIdSet::element_type() );
bool first = true;
- if (first_set.empty()) {
- first_set = lookup_phrase(phrases.front(), my_sa);
- }
- for (vector<std::string>::iterator phrase=phrases.begin()+1; phrase != phrases.end(); ++phrase) {
- ClockedSentIdSet & clocked_temp_set = cache[*phrase];
- SentIdSet & temp_set = clocked_temp_set.first;
- clocked_temp_set.second = clock();
-
- if (temp_set.empty()) {
- temp_set = lookup_phrase(*phrase, my_sa);
- }
+ SentIdSet first_set( new SentIdSet::element_type() );
+ lookup_phrase(first_set, phrases.front(), my_sa, cache);
+ for (vector<std::string>::iterator phrase=phrases.begin()+1;
+ phrase != phrases.end(); ++phrase) {
+ SentIdSet temp_set( new SentIdSet::element_type() );
+ lookup_phrase(temp_set, *phrase, my_sa, cache);
if (first) {
- main_set = ordered_set_intersect(first_set,temp_set);
+ ordered_set_intersect(main_set, first_set, temp_set);
first = false;
}
else {
- main_set = ordered_set_intersect(main_set,temp_set);
- }
- if (temp_set.size() < MINIMUM_SIZE_TO_KEEP) {
- cache.erase(*phrase);
+ SentIdSet new_set( new SentIdSet::element_type() );
+ ordered_set_intersect(new_set, main_set, temp_set);
+ main_set->swap(*new_set);
}
}
-
- if (first_set.size() < MINIMUM_SIZE_TO_KEEP) {
- cache.erase(phrases.front());
- }
-
- return main_set;
+ ids->swap(*main_set);
}
}
-SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicationBase & my_sa, PhraseSetMap & cache)
+void find_occurrences(SentIdSet& ids, const std::string& rule,
+ C_SuffixArraySearchApplicationBase& my_sa, Cache& cache)
{
- SentIdSet sa_set;
-
// we search for hierarchical rules by stripping away NT and looking for terminals sequences
// if a rule contains multiple sequences of terminals, we intersect their occurrences.
if (hierarchical) {
@@ -305,76 +351,142 @@ SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicati
phrases.push_back(rule.substr(pos,NTStartPos-pos));
}
- sa_set = lookup_multiple_phrases(phrases, my_sa, rule, cache);
+ lookup_multiple_phrases(ids, phrases, my_sa, rule, cache);
}
else {
- sa_set = lookup_phrase(rule, my_sa);
+ lookup_phrase(ids, rule, my_sa, cache);
}
- return sa_set;
}
// input: unordered list of translation options for a single source phrase
-void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
+void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
+ Cache& f_cache, Cache& e_cache)
{
- if (pfe_filter_limit>0 && options.size() > pfe_filter_limit) {
+ if (pfe_filter_limit > 0 && options.size() > pfe_filter_limit) {
nremoved_pfefilter += (options.size() - pfe_filter_limit);
- std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer());
- for (std::vector<PTEntry*>::iterator i=options.begin()+pfe_filter_limit; i != options.end(); ++i)
+ std::nth_element(options.begin(), options.begin() + pfe_filter_limit,
+ options.end(), PfeComparer());
+ for (std::vector<PTEntry*>::iterator i = options.begin() + pfe_filter_limit;
+ i != options.end(); ++i)
delete *i;
- options.erase(options.begin()+pfe_filter_limit,options.end());
+ options.erase(options.begin() + pfe_filter_limit,options.end());
}
- if (pef_filter_only) return;
-// std::cerr << "f phrase: " << options.front()->f_phrase << "\n";
- SentIdSet fset;
- fset = find_occurrences(options.front()->f_phrase, f_sa, fsets);
- size_t cf = fset.size();
- for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
+
+ if (pef_filter_only)
+ return;
+
+ if (options.empty())
+ return;
+
+ SentIdSet fset( new SentIdSet::element_type() );
+ find_occurrences(fset, options.front()->f_phrase, f_sa, f_cache);
+ size_t cf = fset->size();
+
+ for (std::vector<PTEntry*>::iterator i = options.begin();
+ i != options.end(); ++i) {
const std::string& e_phrase = (*i)->e_phrase;
- size_t cef=0;
- ClockedSentIdSet& clocked_eset = esets[e_phrase];
- SentIdSet & eset = clocked_eset.first;
- clocked_eset.second = clock();
- if (eset.empty()) {
- eset = find_occurrences(e_phrase, e_sa, esets);
- //std::cerr << "Looking up e-phrase: " << e_phrase << "\n";
- }
- size_t ce=eset.size();
- if (ce < cf) {
- for (SentIdSet::iterator i=eset.begin(); i != eset.end(); ++i) {
- if (std::binary_search(fset.begin(), fset.end(), *i)) cef++;
- }
- } else {
- for (SentIdSet::iterator i=fset.begin(); i != fset.end(); ++i) {
- if (std::binary_search(eset.begin(), eset.end(), *i)) cef++;
- }
- }
+ SentIdSet eset( new SentIdSet::element_type() );
+ find_occurrences(eset, e_phrase, e_sa, e_cache);
+ size_t ce = eset->size();
+
+ SentIdSet efset( new SentIdSet::element_type() );
+ ordered_set_intersect(efset, fset, eset);
+ size_t cef = efset->size();
+
double nlp = -log(fisher_exact(cef, cf, ce));
(*i)->set_cooc_stats(cef, cf, ce, nlp);
- if (ce < MINIMUM_SIZE_TO_KEEP) {
- esets.erase(e_phrase);
- }
-
}
+
std::vector<PTEntry*>::iterator new_end =
- std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
+ std::remove_if(options.begin(), options.end(),
+ NlogSigThresholder(sig_filter_limit));
nremoved_sigfilter += (options.end() - new_end);
options.erase(new_end,options.end());
}
-void prune_cache(PhraseSetMap & psm) {
- if(max_cache && psm.size() > max_cache) {
- std::vector<clock_t> clocks;
- for(PhraseSetMap::iterator it = psm.begin(); it != psm.end(); it++)
- clocks.push_back(it->second.second);
-
- std::sort(clocks.begin(), clocks.end());
- clock_t out = clocks[psm.size()-max_cache];
+void filter(std::istream* in, std::ostream* out, int pfe_index) {
+
+ std::vector<std::string> lines;
+ std::string prev = "";
+ std::vector<PTEntry*> options;
+ while(true) {
+ {
+ boost::mutex::scoped_lock lock(in_mutex);
+ if(in->eof())
+ break;
+
+ lines.clear();
+ std::string line;
+ while(getline(*in, line) && lines.size() < 500000)
+ lines.push_back(line);
+ }
- for(PhraseSetMap::iterator it = psm.begin(); it != psm.end(); it++)
- if(it->second.second < out)
- psm.erase(it);
+ std::stringstream out_temp;
+ for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
+ size_t tmp_lines = ++pt_lines;
+ if(tmp_lines % 10000 == 0) {
+ boost::mutex::scoped_lock lock(err_mutex);
+ std::cerr << ".";
+
+ if(tmp_lines % 500000 == 0)
+ std::cerr << "[n:" << tmp_lines << "]\n";
+
+ if(tmp_lines % 10000000 == 0) {
+ float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
+ float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
+ std::cerr << "------------------------------------------------------\n"
+                  << "      unfiltered phrase pairs: " << pt_lines << "\n"
+ << "\n"
+ << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
+ << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n"
+ << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
+ << "\n"
+ << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
+ << "------------------------------------------------------\n";
+ }
+ }
+
+ if(pt_lines % 10000 == 0) {
+ f_cache.prune();
+ e_cache.prune();
+ }
+
+ if(it->length() > 0) {
+ PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
+ if (prev != pp->f_phrase) {
+ prev = pp->f_phrase;
+
+ if (!options.empty()) { // always true after first line
+ compute_cooc_stats_and_filter(options, f_cache, e_cache);
+ }
+
+ for (std::vector<PTEntry*>::iterator i = options.begin();
+ i != options.end(); ++i) {
+ out_temp << **i << '\n';
+ delete *i;
+ }
+
+ options.clear();
+ options.push_back(pp);
+
+ } else {
+ options.push_back(pp);
+ }
+ }
+ }
+ boost::mutex::scoped_lock lock(out_mutex);
+ *out << out_temp.str() << std::flush;
+ }
+ compute_cooc_stats_and_filter(options, f_cache, e_cache);
+
+ boost::mutex::scoped_lock lock(out_mutex);
+ for (std::vector<PTEntry*>::iterator i = options.begin();
+ i != options.end(); ++i) {
+ *out << **i << '\n';
+ delete *i;
}
+ *out << std::flush;
}
int main(int argc, char * argv[])
@@ -383,7 +495,9 @@ int main(int argc, char * argv[])
const char* efile=0;
const char* ffile=0;
int pfe_index = 2;
- while ((c = getopt(argc, argv, "cpf:e:i:n:l:m:h")) != -1) {
+ int threads = 1;
+ size_t max_cache = 0;
+ while ((c = getopt(argc, argv, "cpf:e:i:n:t:l:m:h")) != -1) {
switch (c) {
case 'e':
efile = optarg;
@@ -398,6 +512,14 @@ int main(int argc, char * argv[])
pfe_filter_limit = atoi(optarg);
std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
break;
+ case 't':
+ threads = atoi(optarg);
+ std::cerr << "Using threads: " << threads << std::endl;
+ break;
+ case 'm':
+ max_cache = atoi(optarg);
+ std::cerr << "Using max phrases in caches: " << max_cache << std::endl;
+ break;
case 'c':
print_cooc_counts = true;
break;
@@ -407,9 +529,6 @@ int main(int argc, char * argv[])
case 'h':
hierarchical = true;
break;
- case 'm':
- max_cache = atoi(optarg);
- break;
case 'l':
std::cerr << "-l = " << optarg << "\n";
if (strcmp(optarg,"a+e") == 0) {
@@ -429,12 +548,13 @@ int main(int argc, char * argv[])
usage();
}
}
+
if (sig_filter_limit == 0.0) pef_filter_only = true;
//-----------------------------------------------------------------------------
if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
usage();
}
-
+
//load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
if (!pef_filter_only) {
e_sa.loadData_forSearch(efile, false, false);
@@ -460,52 +580,17 @@ int main(int argc, char * argv[])
std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
}
- char tmpString[10000];
- std::string prev = "";
- std::vector<PTEntry*> options;
- size_t pt_lines = 0;
- while(!cin.eof()) {
- cin.getline(tmpString,10000,'\n');
- if(++pt_lines%10000==0) {
- std::cerr << ".";
-
- prune_cache(esets);
- prune_cache(fsets);
-
- if(pt_lines%500000==0)
- std::cerr << "[n:"<<pt_lines<<"]\n";
- }
+ Cache::set_max_cache(max_cache);
+ std::ios_base::sync_with_stdio(false);
+
+ boost::thread_group threadGroup;
+ for(int i = 0; i < threads; i++)
+ threadGroup.add_thread(new boost::thread(filter, &std::cin, &std::cout, pfe_index));
+ threadGroup.join_all();
- if(strlen(tmpString)>0) {
- PTEntry* pp = new PTEntry(tmpString, pfe_index);
- if (prev != pp->f_phrase) {
- prev = pp->f_phrase;
-
- if (!options.empty()) { // always true after first line
- compute_cooc_stats_and_filter(options);
- }
- for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
- std::cout << **i << std::endl;
- delete *i;
- }
- options.clear();
- options.push_back(pp);
-
- } else {
- options.push_back(pp);
- }
- // for(int i=0;i<locations.size(); i++){
- // cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
- // }
- }
- }
- compute_cooc_stats_and_filter(options);
- for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
- std::cout << **i << std::endl;
- delete *i;
- }
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
+
std::cerr << "\n\n------------------------------------------------------\n"
<< "      unfiltered phrase pairs: " << pt_lines << "\n"
<< "\n"
@@ -514,7 +599,5 @@ int main(int argc, char * argv[])
<< " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
<< "\n"
<< " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
- << "------------------------------------------------------\n";
-
- return 0;
+ << "------------------------------------------------------\n";
}
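For reference, fisher_exact() above evaluates a one-sided Fisher's exact test on a 2x2 co-occurrence table, with a = cfe and b, c the marginal remainders (defined just above the hunk), d = num_lines - ce - cf + cfe as shown, and n = a + b + c + d. The lgamma expression is the log of the hypergeometric point probability, and the loop steps to ever more extreme tables via a closed-form ratio, so the returned p-value is

$$ P(a) = \frac{(a+b)!\,(c+d)!\,(a+c)!\,(b+d)!}{n!\,a!\,b!\,c!\,d!}, \qquad P(a+1) = P(a)\,\frac{b\,c}{(a+1)(d+1)}, \qquad p = \sum_{i=0}^{\min(b,c)} P(a+i), $$

with the point probability computed in log space for numerical stability.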
diff --git a/contrib/tmcombine/test/model5/model/lex.counts.e2f b/contrib/tmcombine/test/model5/model/lex.counts.e2f
new file mode 100644
index 000000000..ed05c0b7d
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/lex.counts.e2f
@@ -0,0 +1,8 @@
+ad af 500 1000
+bd bf 5 10
+der le 20285 102586
+der NULL 12926 704917
+gipfel sommet 3485 7322
+pass col 419 2911
+pass passeport 7 28
+sitzung séance 14 59
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model5/model/lex.counts.f2e b/contrib/tmcombine/test/model5/model/lex.counts.f2e
new file mode 100644
index 000000000..ea31f690d
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/lex.counts.f2e
@@ -0,0 +1,8 @@
+af ad 500 1000
+bf bd 5 10
+col pass 419 615
+le der 20285 113635
+passeport pass 7 615
+retrouvé NULL 34 1016136
+séance sitzung 14 33
+sommet gipfel 3485 5700
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model5/model/lex.e2f b/contrib/tmcombine/test/model5/model/lex.e2f
new file mode 100644
index 000000000..f9263ffe5
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/lex.e2f
@@ -0,0 +1,8 @@
+ad af 0.5
+bd bf 0.5
+der le 0.1977365
+der NULL 0.0183369
+gipfel sommet 0.4759629
+pass col 0.1439368
+pass passeport 0.2500000
+sitzung séance 0.2372881
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model5/model/lex.f2e b/contrib/tmcombine/test/model5/model/lex.f2e
new file mode 100644
index 000000000..2bba51f01
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/lex.f2e
@@ -0,0 +1,8 @@
+af ad 0.5
+bf bd 0.5
+col pass 0.6813008
+le der 0.1785101
+passeport pass 0.0113821
+retrouvé NULL 0.0000335
+séance sitzung 0.4242424
+sommet gipfel 0.6114035
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model5/model/phrase-table b/contrib/tmcombine/test/model5/model/phrase-table
new file mode 100644
index 000000000..5621b5acf
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/phrase-table
@@ -0,0 +1,8 @@
+ad [X][X] [X] ||| af [X][X] [X] ||| 0.5 0.5 0.5 0.5 2.718 ||| 0-0 1-1 ||| 1000 1000
+bd [X] ||| bf [X] ||| 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 10 10
+der gipfel [X] ||| sommet [X] ||| 0.00327135 0.00872768 0.0366795 0.611403 2.718 ||| 1-0 ||| 5808 518
+der [X][X] pass [X] ||| le [X][X] col [X] ||| 0.0173565 0.0284616 0.288889 0.121619 2.718 ||| 0-0 1-1 2-2 ||| 749 45
+pass [X] ||| col [X] ||| 0.1952 0.143937 0.628866 0.681301 2.718 ||| 0-0 ||| 1875 582
+pass [X] ||| passeport retrouvé [X] ||| 0.5 0.25 0.00171821 3.813e-07 2.718 ||| 0-0 ||| 2 582
+pass [X] ||| passeport [X] ||| 0.266667 0.25 0.00687285 0.0113821 2.718 ||| 0-0 ||| 15 582
+[X][X] sitzung [X] ||| [X][X] séance [X] ||| 0.272727 0.237288 0.352941 0.424242 2.718 ||| 0-0 1-1 ||| 22 17
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/lex.counts.e2f b/contrib/tmcombine/test/model6/model/lex.counts.e2f
new file mode 100644
index 000000000..8475fcdf9
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/lex.counts.e2f
@@ -0,0 +1,8 @@
+ad af 100 1000
+bd bf 1 10
+der le 150181 944391
+der NULL 54483 3595140
+gipfel sommet 3421 9342
+pass col 2 70
+pass passeport 73 379
+sitzung séance 3441 5753
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/lex.counts.f2e b/contrib/tmcombine/test/model6/model/lex.counts.f2e
new file mode 100644
index 000000000..b0913088a
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/lex.counts.f2e
@@ -0,0 +1,8 @@
+af ad 100 1000
+bf bd 1 10
+col pass 2 108
+le der 150181 1356104
+passeport pass 73 108
+retrouvé NULL 43 6276240
+séance sitzung 3441 6142
+sommet gipfel 3421 4908
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/lex.e2f b/contrib/tmcombine/test/model6/model/lex.e2f
new file mode 100644
index 000000000..b1ce3a613
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/lex.e2f
@@ -0,0 +1,8 @@
+ad af 0.1
+bd bf 0.1
+der le 0.1590242
+der NULL 0.0151546
+gipfel sommet 0.366195
+pass col 0.0285714
+pass passeport 0.1926121
+sitzung séance 0.5981227
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/lex.f2e b/contrib/tmcombine/test/model6/model/lex.f2e
new file mode 100644
index 000000000..d931dcb72
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/lex.f2e
@@ -0,0 +1,8 @@
+af ad 0.1
+bf bd 0.1
+col pass 0.0185185
+le der 0.1107445
+passeport pass 0.6759259
+retrouvé NULL 0.0000069
+séance sitzung 0.5602410
+sommet gipfel 0.6970253
\ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/phrase-table b/contrib/tmcombine/test/model6/model/phrase-table
new file mode 100644
index 000000000..9c260f171
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/phrase-table
@@ -0,0 +1,5 @@
+ad [X][X] [X] ||| af [X][X] [X] ||| 0.1 0.1 0.1 0.1 2.718 ||| 0-0 1-1 ||| 1000 1000
+bd [X] ||| bf [X] ||| 0.1 0.1 0.1 0.1 2.718 ||| 0-0 ||| 10 10
+der [X][X] pass [X] ||| le [X][X] passeport [X] ||| 0.16 0.03063 0.4 0.0748551 2.718 ||| 0-0 1-1 2-2 ||| 25 10
+pass [X] ||| passeport [X] ||| 0.28022 0.192612 0.607143 0.675926 2.718 ||| 0-0 ||| 182 84
+[X][X] sitzung [X] ||| [X][X] séance [X] ||| 0.784521 0.598123 0.516654 0.560241 2.718 ||| 0-0 1-1 ||| 4251 6455
\ No newline at end of file
diff --git a/contrib/tmcombine/test/phrase-table_test11 b/contrib/tmcombine/test/phrase-table_test11
new file mode 100644
index 000000000..128cf07d9
--- /dev/null
+++ b/contrib/tmcombine/test/phrase-table_test11
@@ -0,0 +1,9 @@
+ad [X][X] [X] ||| af [X][X] [X] ||| 0.14 0.136364 0.18 0.3 ||| 0-0 1-1 ||| 10000.0 5000.0
+bd [X] ||| bf [X] ||| 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 100.0 50.0
+der [X][X] pass [X] ||| le [X][X] passeport [X] ||| 0.16 0.0307772 0.4 0.0128336 ||| 0-0 1-1 2-2 ||| 225.0 40.0
+der gipfel [X] ||| sommet [X] ||| 0.00327135 0.00569336 0.0366795 0.651018 ||| 1-0 ||| 5808.0 518.0
+der [X][X] pass [X] ||| le [X][X] col [X] ||| 0.0173565 0.0193836 0.288889 0.0675369 ||| 0-0 1-1 2-2 ||| 749.0 45.0
+pass [X] ||| col [X] ||| 0.1952 0.121573 0.398693 0.582296 ||| 0-0 ||| 1875.0 918.0
+pass [X] ||| passeport [X] ||| 0.280097 0.193033 0.22658 0.11065 ||| 0-0 ||| 1653.0 918.0
+pass [X] ||| passeport retrouvé [X] ||| 0.5 0.193033 0.00108932 1.16835e-06 ||| 0-0 ||| 2.0 918.0
+[X][X] sitzung [X] ||| [X][X] séance [X] ||| 0.784227 0.597753 0.516546 0.559514 ||| 0-0 1-1 ||| 38281.0 25837.0
diff --git a/contrib/tmcombine/tmcombine.py b/contrib/tmcombine/tmcombine.py
index b512188d7..467a24e19 100755
--- a/contrib/tmcombine/tmcombine.py
+++ b/contrib/tmcombine/tmcombine.py
@@ -1176,6 +1176,9 @@ def compute_lexicalweight(weights,alignment,word_pairs,marginal,mode='counts',ca
mycache[1] = defaultdict(dict)
for x,translations in alignment:
+ # skip nonterminals
+ if x.startswith(b'['):
+ continue
if cache and translations in mycache[1][x]:
lex_step = mycache[1][x][translations]
@@ -1870,7 +1873,12 @@ def test():
sys.stderr.write('Regression test 10\n')
Combiner = Combine_TMs([[os.path.join('test','model3'),'primary'],[os.path.join('test','model4'),'primary']],output_file=os.path.join('test','phrase-table_test10'),mode='counts',number_of_features=8,i_e2f=4,i_e2f_lex=5,i_f2e=6,i_f2e_lex=7,reference_file='test/extract')
Combiner.combine_given_tuning_set()
-
+
+ # count-based combination of two models with fixed weights; same as test 3, but with hierarchical models
+ # command line: python tmcombine.py combine_given_weights test/model5 test/model6 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test11 -m counts
+ sys.stderr.write('Regression test 11\n')
+ Combiner = Combine_TMs([[os.path.join('test','model5'),'primary'],[os.path.join('test','model6'),'primary']],[[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test11'),mode='counts')
+ Combiner.combine_given_weights()
#convert weight vector passed as a command line argument
class to_list(argparse.Action):
diff --git a/jam-files/sanity.jam b/jam-files/sanity.jam
index 173444ef1..9b4d5873d 100644
--- a/jam-files/sanity.jam
+++ b/jam-files/sanity.jam
@@ -58,7 +58,7 @@ if $(FORCE-STATIC) {
rule test_library ( name ) {
if $(FORCE-STATIC) {
- return [ test_flags "-l$(name) -static" ] ;
+ return [ test_flags "-Wl,-Bstatic -l$(name) -Wl,-Bdynamic" ] ;
} else {
return [ test_flags "-l$(name)" ] ;
}
@@ -88,7 +88,7 @@ rule auto-shared ( name : additional * ) {
if $(shared-command-line) = "<link>shared" {
return "<link>shared" ;
} else {
- if [ test_flags $(additional)" -static -l"$(name) ] {
+ if [ test_flags $(additional)" -Wl,-Bstatic -l"$(name)" -Wl,-Bdynamic" ] {
return ;
} else {
if $(FORCE-STATIC) {
@@ -140,7 +140,7 @@ rule boost-lib ( name macro : deps * ) {
if $(boost-auto-shared) = "<link>shared" {
flags += " -DBOOST_$(macro)" ;
} else {
- flags += " -static" ;
+ flags = " -Wl,-Bstatic $(flags) -Wl,-Bdynamic " ;
}
if [ test_flags $(flags) : $(main) ] {
lib inner_boost_$(name) : : <threading>single $(boost-search) <name>boost_$(name)$(boost-lib-version) : <link>static : <library>$(deps) ;
diff --git a/lm/Jamfile b/lm/Jamfile
index 227b22014..edc3751a7 100644
--- a/lm/Jamfile
+++ b/lm/Jamfile
@@ -14,7 +14,7 @@ update-if-changed $(ORDER-LOG) $(max-order) ;
max-order += <dependency>$(ORDER-LOG) ;
wrappers = ;
-local with-nplm = [ option.get "with-nplm" ] ;
+local with-nplm = [ option.get "with-nplm-0.1" ] ;
if $(with-nplm) {
lib neuralLM : : <search>$(with-nplm)/src ;
obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ;
diff --git a/lm/builder/adjust_counts.cc b/lm/builder/adjust_counts.cc
index 080b438a4..803c557d0 100644
--- a/lm/builder/adjust_counts.cc
+++ b/lm/builder/adjust_counts.cc
@@ -29,28 +29,44 @@ class StatCollector {
~StatCollector() {}
- void CalculateDiscounts() {
+ void CalculateDiscounts(const DiscountConfig &config) {
counts_.resize(orders_.size());
counts_pruned_.resize(orders_.size());
- discounts_.resize(orders_.size());
for (std::size_t i = 0; i < orders_.size(); ++i) {
const OrderStat &s = orders_[i];
counts_[i] = s.count;
counts_pruned_[i] = s.count_pruned;
+ }
- for (unsigned j = 1; j < 4; ++j) {
- // TODO: Specialize error message for j == 3, meaning 3+
- UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for "
- << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any "
- << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?");
- }
-
- // See equation (26) in Chen and Goodman.
- discounts_[i].amount[0] = 0.0;
- float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]);
- for (unsigned j = 1; j < 4; ++j) {
- discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]);
- UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]);
+ discounts_ = config.overwrite;
+ discounts_.resize(orders_.size());
+ for (std::size_t i = config.overwrite.size(); i < orders_.size(); ++i) {
+ const OrderStat &s = orders_[i];
+ try {
+ for (unsigned j = 1; j < 4; ++j) {
+ // TODO: Specialize error message for j == 3, meaning 3+
+ UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for "
+ << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any "
+ << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?");
+ }
+
+ // See equation (26) in Chen and Goodman.
+ discounts_[i].amount[0] = 0.0;
+ float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]);
+ for (unsigned j = 1; j < 4; ++j) {
+ discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]);
+ UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]);
+ }
+ } catch (const BadDiscountException &e) {
+ switch (config.bad_action) {
+ case THROW_UP:
+ throw;
+ case COMPLAIN:
+ std::cerr << e.what() << " Substituting fallback discounts D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl;
+ case SILENT:
+ break;
+ }
+ discounts_[i] = config.fallback;
}
}
}
@@ -179,7 +195,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
for (NGramStream full(positions[0]); full; ++full)
stats.AddFull(full->Count());
- stats.CalculateDiscounts();
+ stats.CalculateDiscounts(discount_config_);
return;
}
@@ -262,7 +278,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
for (NGramStream *s = streams.begin(); s != streams.end(); ++s)
s->Poison();
- stats.CalculateDiscounts();
+ stats.CalculateDiscounts(discount_config_);
// NOTE: See special early-return case for unigrams near the top of this function
}
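
For reference, the closed-form estimate computed inside the new try block is equation (26) of Chen and Goodman, with $n_j$ the number of $(i+1)$-grams whose adjusted count is $j$:

    D_0 = 0, \qquad Y = \frac{n_1}{n_1 + 2 n_2}, \qquad
    D_j = j - (j+1)\, Y\, \frac{n_{j+1}}{n_j} \qquad (j = 1, 2, 3)

The two UTIL_THROW_IF guards are exactly this formula's failure modes: $n_j = 0$ divides by zero, and an estimate outside $[0, j]$ is invalid. Wrapping the computation in try/catch lets bad_action substitute config.fallback instead of aborting.
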
diff --git a/lm/builder/adjust_counts.hh b/lm/builder/adjust_counts.hh
index 60198e8f8..a5435c282 100644
--- a/lm/builder/adjust_counts.hh
+++ b/lm/builder/adjust_counts.hh
@@ -2,6 +2,7 @@
#define LM_BUILDER_ADJUST_COUNTS_H
#include "lm/builder/discount.hh"
+#include "lm/lm_exception.hh"
#include "util/exception.hh"
#include <vector>
@@ -19,6 +20,16 @@ class BadDiscountException : public util::Exception {
~BadDiscountException() throw();
};
+struct DiscountConfig {
+ // Overrides discounts for orders [1,overwrite.size()].
+ std::vector<Discount> overwrite;
+ // If discounting fails for an order, the discounts are copied from here.
+ Discount fallback;
+ // What to do when discounts are out of range or would trigger division by
+ // zero. If it is anything other than THROW_UP, fallback is used.
+ WarningAction bad_action;
+};
+
/* Compute adjusted counts.
* Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
* Output: [1,N]-grams with adjusted counts.
@@ -27,17 +38,28 @@ class BadDiscountException : public util::Exception {
*/
class AdjustCounts {
public:
- AdjustCounts(std::vector<uint64_t> &counts, std::vector<uint64_t> &counts_pruned, std::vector<Discount> &discounts, std::vector<uint64_t> &prune_thresholds)
- : counts_(counts), counts_pruned_(counts_pruned), discounts_(discounts), prune_thresholds_(prune_thresholds)
+ // counts: output
+ // counts_pruned: output
+ // discounts: mostly output. If the input already has entries, they will be kept.
+ // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned.
+ AdjustCounts(
+ const std::vector<uint64_t> &prune_thresholds,
+ std::vector<uint64_t> &counts,
+ std::vector<uint64_t> &counts_pruned,
+ const DiscountConfig &discount_config,
+ std::vector<Discount> &discounts)
+ : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), discount_config_(discount_config), discounts_(discounts)
{}
void Run(const util::stream::ChainPositions &positions);
private:
+ const std::vector<uint64_t> &prune_thresholds_;
std::vector<uint64_t> &counts_;
std::vector<uint64_t> &counts_pruned_;
+
+ DiscountConfig discount_config_;
std::vector<Discount> &discounts_;
- std::vector<uint64_t> &prune_thresholds_;
};
} // namespace builder
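
A minimal sketch of how a caller fills the new struct before constructing AdjustCounts, mirroring the updated unit test further down (the helper name is illustrative):

    #include "lm/builder/adjust_counts.hh"   // DiscountConfig, AdjustCounts
    #include "lm/builder/discount.hh"        // Discount

    // THROW_UP aborts on a bad discount estimate, COMPLAIN warns and
    // substitutes config.fallback, SILENT substitutes quietly.
    lm::builder::DiscountConfig MakeDiscountConfig(bool allow_fallback) {
      lm::builder::DiscountConfig config;
      config.fallback = lm::builder::Discount();  // replace with user values
      config.bad_action = allow_fallback ? lm::COMPLAIN : lm::THROW_UP;
      return config;                              // overwrite left empty here
    }
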
diff --git a/lm/builder/adjust_counts_test.cc b/lm/builder/adjust_counts_test.cc
index 9d8ef65b6..073c5dfeb 100644
--- a/lm/builder/adjust_counts_test.cc
+++ b/lm/builder/adjust_counts_test.cc
@@ -75,7 +75,10 @@ BOOST_AUTO_TEST_CASE(Simple) {
chains >> util::stream::kRecycle;
std::vector<uint64_t> counts_pruned(4);
std::vector<uint64_t> prune_thresholds(4);
- BOOST_CHECK_THROW(AdjustCounts(counts, counts_pruned, discount, prune_thresholds).Run(for_adjust), BadDiscountException);
+ DiscountConfig discount_config;
+ discount_config.fallback = Discount();
+ discount_config.bad_action = THROW_UP;
+ BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, discount_config, discount).Run(for_adjust), BadDiscountException);
}
BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]);
diff --git a/lm/builder/initial_probabilities.cc b/lm/builder/initial_probabilities.cc
index f6ee334c7..5d19a8973 100644
--- a/lm/builder/initial_probabilities.cc
+++ b/lm/builder/initial_probabilities.cc
@@ -69,9 +69,12 @@ class PruneNGramStream {
block_->SetValidSize(dest_.Base() - block_base);
++block_;
StartBlock();
+ if (block_) {
+ currentCount_ = current_.CutoffCount();
+ }
+ } else {
+ currentCount_ = current_.CutoffCount();
}
-
- currentCount_ = current_.CutoffCount();
return *this;
}
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 3e1225d9e..a7947a422 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -9,14 +9,66 @@
#include "util/murmur_hash.hh"
#include <assert.h>
+#include <math.h>
namespace lm { namespace builder {
namespace {
-class Callback {
+/* Calculate q, the collapsed probability and backoff, as defined in
+ * @inproceedings{Heafield-rest,
+ * author = {Kenneth Heafield and Philipp Koehn and Alon Lavie},
+ * title = {Language Model Rest Costs and Space-Efficient Storage},
+ * year = {2012},
+ * month = {July},
+ * booktitle = {Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},
+ * address = {Jeju Island, Korea},
+ * pages = {1169--1178},
+ * url = {http://kheafield.com/professional/edinburgh/rest\_paper.pdf},
+ * }
+ * This is particularly convenient to calculate during interpolation because
+ * the needed backoff terms are already accessed at the same time.
+ */
+class OutputQ {
+ public:
+ explicit OutputQ(std::size_t order) : q_delta_(order) {}
+
+ void Gram(unsigned order_minus_1, float full_backoff, ProbBackoff &out) {
+ float &q_del = q_delta_[order_minus_1];
+ if (order_minus_1) {
+ // Divide by context's backoff (which comes in as out.backoff)
+ q_del = q_delta_[order_minus_1 - 1] / out.backoff * full_backoff;
+ } else {
+ q_del = full_backoff;
+ }
+ out.prob = log10f(out.prob * q_del);
+ // TODO: stop wastefully outputting this!
+ out.backoff = 0.0;
+ }
+
+ private:
+ // Product of backoffs in the numerator divided by backoffs in the
+ // denominator. Does not include
+ std::vector<float> q_delta_;
+};
+
+/* Default: output probability and backoff */
+class OutputProbBackoff {
+ public:
+ explicit OutputProbBackoff(std::size_t /*order*/) {}
+
+ void Gram(unsigned /*order_minus_1*/, float full_backoff, ProbBackoff &out) const {
+ // Correcting for numerical precision issues. Take that IRST.
+ out.prob = std::min(0.0f, log10f(out.prob));
+ out.backoff = log10f(full_backoff);
+ }
+};
+
+template <class Output> class Callback {
public:
Callback(float uniform_prob, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds)
- : backoffs_(backoffs.size()), probs_(backoffs.size() + 2), prune_thresholds_(prune_thresholds) {
+ : backoffs_(backoffs.size()), probs_(backoffs.size() + 2),
+ prune_thresholds_(prune_thresholds),
+ output_(backoffs.size() + 1 /* order */) {
probs_[0] = uniform_prob;
for (std::size_t i = 0; i < backoffs.size(); ++i) {
backoffs_.push_back(backoffs[i]);
@@ -40,15 +92,9 @@ class Callback {
Payload &pay = gram.Value();
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
probs_[order_minus_1 + 1] = pay.complete.prob;
- pay.complete.prob = log10(pay.complete.prob);
-
- if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
- // This skips over ngrams if backoffs have been exhausted.
- if(!backoffs_[order_minus_1]) {
- pay.complete.backoff = 0.0;
- return;
- }
+ float out_backoff;
+ if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
if(prune_thresholds_[order_minus_1 + 1] > 0) {
//Compute hash value for current context
uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
@@ -58,20 +104,22 @@ class Callback {
hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
if(current_hash == hashed_backoff->hash_value) {
- pay.complete.backoff = log10(hashed_backoff->gamma);
+ out_backoff = hashed_backoff->gamma;
++backoffs_[order_minus_1];
} else {
// Has been pruned away so it is not a context anymore
- pay.complete.backoff = 0.0;
+ out_backoff = 1.0;
}
} else {
- pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
+ out_backoff = *static_cast<const float*>(backoffs_[order_minus_1].Get());
++backoffs_[order_minus_1];
}
} else {
// Not a context.
- pay.complete.backoff = 0.0;
+ out_backoff = 1.0;
}
+
+ output_.Gram(order_minus_1, out_backoff, pay.complete);
}
void Exit(unsigned, const NGram &) const {}
@@ -81,19 +129,29 @@ class Callback {
std::vector<float> probs_;
const std::vector<uint64_t>& prune_thresholds_;
+
+ Output output_;
};
} // namespace
-Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t>& prune_thresholds)
+Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t>& prune_thresholds, bool output_q)
: uniform_prob_(1.0 / static_cast<float>(vocab_size)), // Includes <unk> but excludes <s>.
backoffs_(backoffs),
- prune_thresholds_(prune_thresholds) {}
+ prune_thresholds_(prune_thresholds),
+ output_q_(output_q) {}
// perform order-wise interpolation
void Interpolate::Run(const util::stream::ChainPositions &positions) {
assert(positions.size() == backoffs_.size() + 1);
- Callback callback(uniform_prob_, backoffs_, prune_thresholds_);
- JointOrder<Callback, SuffixOrder>(positions, callback);
+ if (output_q_) {
+ typedef Callback<OutputQ> C;
+ C callback(uniform_prob_, backoffs_, prune_thresholds_);
+ JointOrder<C, SuffixOrder>(positions, callback);
+ } else {
+ typedef Callback<OutputProbBackoff> C;
+ C callback(uniform_prob_, backoffs_, prune_thresholds_);
+ JointOrder<C, SuffixOrder>(positions, callback);
+ }
}
}} // namespaces
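
The template parameter turns the old Callback into a compile-time policy dispatch: Interpolate::Run picks OutputQ or OutputProbBackoff once, so the per-n-gram path carries no runtime branch on the output format. A self-contained sketch of the same pattern with simplified, illustrative names:

    #include <math.h>
    #include <algorithm>
    #include <iostream>

    // Policy mirroring OutputProbBackoff: clamp to 0, then take log10.
    struct ProbBackoffPolicy {
      void Gram(float prob, float backoff) const {
        std::cout << std::min(0.0f, log10f(prob)) << '\t'
                  << log10f(backoff) << '\n';
      }
    };

    template <class Output> class Emitter {
      public:
        void Enter(float prob, float backoff) { output_.Gram(prob, backoff); }
      private:
        Output output_;   // chosen at compile time, as in Callback<Output>
    };

    int main() {
      Emitter<ProbBackoffPolicy> emit;   // swap the policy to change format
      emit.Enter(0.25f, 0.5f);           // prints -0.60206 and -0.30103
    }
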
diff --git a/lm/builder/interpolate.hh b/lm/builder/interpolate.hh
index 55a55428f..0acece926 100644
--- a/lm/builder/interpolate.hh
+++ b/lm/builder/interpolate.hh
@@ -18,7 +18,7 @@ class Interpolate {
public:
// Normally vocab_size is the unigram count-1 (since p(<s>) = 0) but might
// be larger when the user specifies a consistent vocabulary size.
- explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds);
+ explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool output_q_);
void Run(const util::stream::ChainPositions &positions);
@@ -26,6 +26,7 @@ class Interpolate {
float uniform_prob_;
util::stream::ChainPositions backoffs_;
const std::vector<uint64_t> prune_thresholds_;
+ bool output_q_;
};
}} // namespaces
diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc
index e1ae2d417..265dd2164 100644
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@@ -33,7 +33,6 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
// convert to vector of integers
std::vector<uint64_t> prune_thresholds;
prune_thresholds.reserve(order);
- std::cerr << "Pruning ";
for (std::vector<std::string>::const_iterator it(param.begin()); it != param.end(); ++it) {
try {
prune_thresholds.push_back(boost::lexical_cast<uint64_t>(*it));
@@ -66,6 +65,18 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
return prune_thresholds;
}
+lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) {
+ lm::builder::Discount ret;
+ UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+");
+ UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified");
+ ret.amount[0] = 0.0;
+ for (unsigned i = 0; i < 3; ++i) {
+ float discount = boost::lexical_cast<float>(param[i < param.size() ? i : (param.size() - 1)]);
+ UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
+ ret.amount[i + 1] = discount;
+ }
+ return ret;
+}
} // namespace
@@ -77,7 +88,11 @@ int main(int argc, char *argv[]) {
std::string text, arpa;
std::vector<std::string> pruning;
-
+ std::vector<std::string> discount_fallback;
+ std::vector<std::string> discount_fallback_default;
+ discount_fallback_default.push_back("0.5");
+ discount_fallback_default.push_back("1");
+ discount_fallback_default.push_back("1.5");
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
@@ -86,7 +101,7 @@ int main(int argc, char *argv[]) {
->required()
#endif
, "Order of the model")
- ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)")
+ ("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
@@ -99,7 +114,9 @@ int main(int argc, char *argv[]) {
("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
- ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Unigram pruning is not implemented, so the first value must be zero. Default is to not prune, which is equivalent to --prune 0.");
+ ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
+ ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Unigram pruning is not implemented, so the first value must be zero. Default is to not prune, which is equivalent to --prune 0.")
+ ("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, options), vm);
@@ -143,7 +160,7 @@ int main(int argc, char *argv[]) {
#endif
if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
- std::cerr << "--vocab_pad requires --interpolate_unigrams" << std::endl;
+ std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl;
return 1;
}
@@ -153,6 +170,15 @@ int main(int argc, char *argv[]) {
pipeline.disallowed_symbol_action = lm::THROW_UP;
}
+ if (vm.count("discount_fallback")) {
+ pipeline.discount.fallback = ParseDiscountFallback(discount_fallback);
+ pipeline.discount.bad_action = lm::COMPLAIN;
+ } else {
+ // Unused, just here to prevent the compiler from complaining about an uninitialized value.
+ pipeline.discount.fallback = lm::builder::Discount();
+ pipeline.discount.bad_action = lm::THROW_UP;
+ }
+
// parse pruning thresholds. These depend on order, so it is not done as a notifier.
pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);
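
The index clamp in ParseDiscountFallback pads short argument lists by repeating the last value: --discount_fallback 0.5 yields D1 = D2 = D3+ = 0.5, and --discount_fallback 0.5 1 yields D1 = 0.5, D2 = D3+ = 1. That rule in isolation:

    #include <iostream>
    #include <vector>

    int main() {
      std::vector<float> param;          // user-supplied fallback discounts
      param.push_back(0.5f);
      param.push_back(1.0f);
      float amount[4] = {0.0f};          // amount[0] is always 0.0
      for (unsigned i = 0; i < 3; ++i)   // same index clamp as the patch
        amount[i + 1] = param[i < param.size() ? i : param.size() - 1];
      for (unsigned j = 1; j < 4; ++j)   // prints D1 = 0.5, D2 = 1, D3+ = 1
        std::cout << "D" << j << (j == 3 ? "+" : "") << " = " << amount[j] << '\n';
    }
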
diff --git a/lm/builder/pipeline.cc b/lm/builder/pipeline.cc
index e91870808..21064ab3a 100644
--- a/lm/builder/pipeline.cc
+++ b/lm/builder/pipeline.cc
@@ -280,7 +280,7 @@ void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &maste
gamma_chains.push_back(read_backoffs);
gamma_chains.back() >> gammas[i].Source();
}
- master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds);
+ master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.output_q);
gamma_chains >> util::stream::kRecycle;
master.BufferFinal(counts);
}
@@ -317,7 +317,7 @@ void Pipeline(PipelineConfig config, int text_file, int out_arpa) {
std::vector<uint64_t> counts;
std::vector<uint64_t> counts_pruned;
std::vector<Discount> discounts;
- master >> AdjustCounts(counts, counts_pruned, discounts, config.prune_thresholds);
+ master >> AdjustCounts(config.prune_thresholds, counts, counts_pruned, config.discount, discounts);
{
util::FixedArray<util::stream::FileBuffer> gammas;
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index 4395622ed..09e1a4d52 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -1,6 +1,7 @@
#ifndef LM_BUILDER_PIPELINE_H
#define LM_BUILDER_PIPELINE_H
+#include "lm/builder/adjust_counts.hh"
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/header_info.hh"
#include "lm/lm_exception.hh"
@@ -19,6 +20,8 @@ struct PipelineConfig {
util::stream::SortConfig sort;
InitialProbabilitiesConfig initial_probs;
util::stream::ChainConfig read_backoffs;
+
+ // Include a header in the ARPA with some statistics?
bool verbose_header;
// Estimated vocabulary size. Used for sizing CorpusCount memory and
@@ -34,6 +37,12 @@ struct PipelineConfig {
// n-gram count thresholds for pruning. 0 values means no pruning for
// corresponding n-gram order
std::vector<uint64_t> prune_thresholds; //mjd
+
+ // What to do with discount failures.
+ DiscountConfig discount;
+
+ // Compute collapsed q values instead of probability and backoff
+ bool output_q;
/* Computing the perplexity of LMs with different vocabularies is hard. For
* example, the lowest perplexity is attained by a unigram model that
diff --git a/lm/builder/print.cc b/lm/builder/print.cc
index 75f15f0a6..aee6e1341 100644
--- a/lm/builder/print.cc
+++ b/lm/builder/print.cc
@@ -50,7 +50,7 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) {
out << "\\" << order << "-grams:" << '\n';
for (NGramStream stream(positions[order - 1]); stream; ++stream) {
// Correcting for numerical precision issues. Take that IRST.
- out << std::min(0.0f, stream->Value().complete.prob) << '\t' << vocab_.Lookup(*stream->begin());
+ out << stream->Value().complete.prob << '\t' << vocab_.Lookup(*stream->begin());
for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
out << ' ' << vocab_.Lookup(*i);
}
diff --git a/mert/FeatureData.h b/mert/FeatureData.h
index 2510b3aee..6e79529d4 100644
--- a/mert/FeatureData.h
+++ b/mert/FeatureData.h
@@ -12,6 +12,7 @@
#include <vector>
#include <iostream>
#include <stdexcept>
+#include <boost/lexical_cast.hpp>
#include "FeatureArray.h"
namespace MosesTuning
@@ -103,7 +104,7 @@ public:
inline int getName(std::size_t idx) const {
idx2name::const_iterator i = m_index_to_array_name.find(idx);
if (i != m_index_to_array_name.end())
- throw std::runtime_error("there is no entry at index " + idx);
+ throw std::runtime_error("there is no entry at index " + boost::lexical_cast<std::string>(idx));
return i->second;
}
@@ -116,7 +117,7 @@ public:
throw std::runtime_error("Error: you required an too big index");
std::map<std::size_t, std::string>::const_iterator it = m_index_to_feature_name.find(idx);
if (it == m_index_to_feature_name.end()) {
- throw std::runtime_error("Error: specified id is unknown: " + idx);
+ throw std::runtime_error("Error: specified id is unknown: " + boost::lexical_cast<std::string>(idx));
} else {
return it->second;
}
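
Both mert fixes in this file repair the same C++ pitfall: adding an integer to a string literal is pointer arithmetic on the character array, not concatenation, so the old exceptions carried truncated messages (or, for a large idx, read out of bounds). A small demonstration:

    #include <boost/lexical_cast.hpp>
    #include <iostream>
    #include <string>

    int main() {
      std::size_t idx = 3;
      std::cout << ("index " + idx) << '\n';   // pointer arithmetic: "ex "
      std::cout << ("index " + boost::lexical_cast<std::string>(idx))
                << '\n';                       // concatenation: "index 3"
    }
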
diff --git a/mert/MeteorScorer.cpp b/mert/MeteorScorer.cpp
index d26b8f0f3..914fd02d4 100644
--- a/mert/MeteorScorer.cpp
+++ b/mert/MeteorScorer.cpp
@@ -181,7 +181,7 @@ void MeteorScorer::setReferenceFiles(const vector<string>& referenceFiles) {}
void MeteorScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) {}
-float MeteorScorer::calculateScore(const vector<int>& comps) const
+float MeteorScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
// Should never be reached
return 0.0;
diff --git a/mert/MiraWeightVector.cpp b/mert/MiraWeightVector.cpp
index 3b7b1780c..c8a1ca774 100644
--- a/mert/MiraWeightVector.cpp
+++ b/mert/MiraWeightVector.cpp
@@ -139,7 +139,7 @@ ostream& operator<<(ostream& o, const MiraWeightVector& e)
for(size_t i=0; i<e.m_weights.size(); i++) {
if(abs(e.m_weights[i])>1e-8) {
if(i>0) o << " ";
- cerr << i << ":" << e.m_weights[i];
+ o << i << ":" << e.m_weights[i];
}
}
return o;
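
The one-character fix above matters because operator<< must write to the stream it receives; routing output to cerr only looks correct when the caller happens to be printing to cerr, and silently drops the weights otherwise. A sketch with a simplified weight type (illustrative, not the MiraWeightVector API):

    #include <iostream>
    #include <sstream>

    struct Weights { float w; };

    std::ostream& operator<<(std::ostream& o, const Weights& e) {
      o << "0:" << e.w;   // correct: honor o (the old code wrote to cerr)
      return o;
    }

    int main() {
      std::ostringstream buf;
      Weights w = {0.5f};
      buf << w;                         // captured because we wrote to o
      std::cout << buf.str() << '\n';   // prints 0:0.5
    }
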
diff --git a/mert/ScoreData.h b/mert/ScoreData.h
index 9159e029f..5e96a2e06 100644
--- a/mert/ScoreData.h
+++ b/mert/ScoreData.h
@@ -13,6 +13,7 @@
#include <vector>
#include <stdexcept>
#include <string>
+#include <boost/lexical_cast.hpp>
#include "ScoreArray.h"
#include "ScoreStats.h"
@@ -108,7 +109,7 @@ public:
inline int getName(std::size_t idx) const {
idx2name::const_iterator i = m_index_to_array_name.find(idx);
if (i != m_index_to_array_name.end())
- throw std::runtime_error("there is no entry at index " + idx);
+ throw std::runtime_error("there is no entry at index " + boost::lexical_cast<std::string>(idx));
return i->second;
}
};
diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp
index 0be72e9f9..0abce8af4 100644
--- a/mert/kbmira.cpp
+++ b/mert/kbmira.cpp
@@ -72,6 +72,7 @@ int main(int argc, char** argv)
float decay = 0.999; // Pseudo-corpus decay \gamma
int n_iters = 60; // Max epochs J
bool streaming = false; // Stream all k-best lists?
+ bool streaming_out = false; // Stream output after each sentence?
bool no_shuffle = false; // Don't shuffle, even for in memory version
bool model_bg = false; // Use model for background corpus
bool verbose = false; // Verbose updates
@@ -97,6 +98,7 @@ int main(int argc, char** argv)
("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features. This should have 'name= value' on each line, or (legacy) should be the Moses mert 'init.opt' format.")
("sparse-init,s", po::value<string>(&sparseInitFile), "Weight file for sparse features")
("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle")
+ ("streaming-out", po::value(&streaming_out)->zero_tokens()->default_value(false), "Stream weights to stdout after each sentence")
("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch")
("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background")
("verbose", po::value(&verbose)->zero_tokens()->default_value(false), "Verbose updates")
@@ -235,7 +237,8 @@ int main(int argc, char** argv)
}
// Training loop
- cerr << "Initial BLEU = " << decoder->Evaluate(wv.avg()) << endl;
+ if (!streaming_out)
+ cerr << "Initial BLEU = " << decoder->Evaluate(wv.avg()) << endl;
ValType bestBleu = 0;
for(int j=0; j<n_iters; j++) {
// MIRA train for one epoch
@@ -283,6 +286,8 @@ int main(int argc, char** argv)
}
iNumExamples++;
++sentenceIndex;
+ if (streaming_out)
+ cout << wv << endl;
}
// Training Epoch summary
cerr << iNumUpdates << "/" << iNumExamples << " updates"
diff --git a/mira/Decoder.cpp b/mira/Decoder.cpp
index 122106b96..5f5337312 100644
--- a/mira/Decoder.cpp
+++ b/mira/Decoder.cpp
@@ -143,7 +143,7 @@ vector< vector<const Word*> > MosesDecoder::runDecoder(const std::string& source
string filename)
{
// run the decoder
- m_manager = new Moses::Manager(0,*m_sentence, search);
+ m_manager = new Moses::Manager(*m_sentence, search);
m_manager->ProcessSentence();
TrellisPathList nBestList;
m_manager->CalcNBest(nBestSize, nBestList, distinct);
@@ -220,7 +220,7 @@ vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& s
size_t epoch)
{
// run the decoder
- m_chartManager = new ChartManager(0,*m_sentence);
+ m_chartManager = new ChartManager(*m_sentence);
m_chartManager->ProcessSentence();
ChartKBestExtractor::KBestVec nBestList;
m_chartManager->CalcNBest(nBestSize, nBestList, distinct);
diff --git a/misc/prunePhraseTable.cpp b/misc/prunePhraseTable.cpp
index 3086b4249..dcf8d73da 100644
--- a/misc/prunePhraseTable.cpp
+++ b/misc/prunePhraseTable.cpp
@@ -28,6 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <cstring>
#include <iostream>
#include <fstream>
+#include <map>
#include <string>
#include <vector>
@@ -50,6 +51,7 @@ using namespace Moses;
using namespace std;
namespace po = boost::program_options;
+typedef multimap<float,string> Lines;
static void usage(const po::options_description& desc, char** argv) {
cerr << "Usage: " + string(argv[0]) + " [options] input-file output-file" << endl;
@@ -57,25 +59,22 @@ static void usage(const po::options_description& desc, char** argv) {
}
//Find top n translations of source, and send them to output
-static void outputTopN(const StringPiece& sourcePhraseString, PhraseDictionary* phraseTable, const std::vector<FactorType> &input, ostream& out) {
- //get list of target phrases
- Phrase sourcePhrase;
- sourcePhrase.CreateFromString(Input,input,sourcePhraseString,NULL);
- InputPath inputPath(sourcePhrase, NonTerminalSet(), WordsRange(0,sourcePhrase.GetSize()-1),NULL,NULL);
- InputPathList inputPaths;
- inputPaths.push_back(&inputPath);
- phraseTable->GetTargetPhraseCollectionBatch(inputPaths);
- const TargetPhraseCollection* targetPhrases = inputPath.GetTargetPhrases(*phraseTable);
-
-
-
-
- //print phrases
- const std::vector<FactorType>& output = StaticData::Instance().GetOutputFactorOrder();
- if (targetPhrases) {
- //if (targetPhrases->GetSize() > 10) cerr << "src " << sourcePhrase << " tgt count " << targetPhrases->GetSize() << endl;
- for (TargetPhraseCollection::const_iterator i = targetPhrases->begin(); i != targetPhrases->end(); ++i) {
- const TargetPhrase* targetPhrase = *i;
+static void outputTopN(Lines lines, size_t maxPhrases, ostream& out) {
+ size_t count = 0;
+ for (Lines::const_reverse_iterator i = lines.rbegin(); i != lines.rend(); ++i) {
+ out << i->second << endl;
+ ++count;
+ if (count >= maxPhrases) break;
+ }
+}
+/*
+static void outputTopN(const Phrase& sourcePhrase, const multimap<float,const TargetPhrase*>& targetPhrases,
+ size_t maxPhrases, const PhraseDictionary* phraseTable,
+ const vector<FactorType> & input, const vector<FactorType> & output, ostream& out) {
+ size_t count = 0;
+ for (multimap<float,const TargetPhrase*>::const_reverse_iterator i
+ = targetPhrases.rbegin(); i != targetPhrases.rend() && count < maxPhrases; ++i, ++count) {
+ const TargetPhrase* targetPhrase = i->second;
out << sourcePhrase.GetStringRep(input);
out << " ||| ";
out << targetPhrase->GetStringRep(output);
@@ -92,15 +91,13 @@ static void outputTopN(const StringPiece& sourcePhraseString, PhraseDictionary*
}
out << endl;
}
- }
-
-}
-
+}*/
int main(int argc, char** argv)
{
bool help;
string input_file;
string config_file;
+ size_t maxPhrases = 100;
po::options_description desc("Allowed options");
@@ -108,13 +105,14 @@ int main(int argc, char** argv)
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("input-file,i", po::value<string>(&input_file), "Input file")
("config-file,f", po::value<string>(&config_file), "Config file")
+ ("max-phrases,n", po::value<size_t>(&maxPhrases), "Maximum target phrases per source phrase")
;
po::options_description cmdline_options;
cmdline_options.add(desc);
po::variables_map vm;
po::parsed_options parsed = po::command_line_parser(argc,argv).
- options(cmdline_options).allow_unregistered().run();
+ options(cmdline_options).run();
po::store(parsed, vm);
po::notify(vm);
if (help) {
@@ -136,24 +134,6 @@ int main(int argc, char** argv)
mosesargs.push_back(argv[0]);
mosesargs.push_back("-f");
mosesargs.push_back(config_file);
- for (size_t i = 0; i < parsed.options.size(); ++i) {
- if (parsed.options[i].position_key == -1 && !parsed.options[i].unregistered) continue;
- /*
- const string& key = parsed.options[i].string_key;
- if (!key.empty()) {
- mosesargs.push_back(key);
- }
- for (size_t j = 0; j < parsed.options[i].value.size(); ++j) {
- const string& value = parsed.options[i].value[j];
- if (!value.empty()) {
- mosesargs.push_back(value);
- }
- }*/
-
- for (size_t j = 0; j < parsed.options[i].original_tokens.size(); ++j) {
- mosesargs.push_back(parsed.options[i].original_tokens[j]);
- }
- }
boost::scoped_ptr<Parameter> params(new Parameter());
char** mosesargv = new char*[mosesargs.size()];
@@ -172,9 +152,8 @@ int main(int argc, char** argv)
}
const StaticData &staticData = StaticData::Instance();
- const std::vector<FactorType> & input = staticData.GetInputFactorOrder();
- //Find the phrase table to evaluate with
+ //Find the phrase table to manage the target phrases
PhraseDictionary* phraseTable = NULL;
const vector<FeatureFunction*>& ffs = FeatureFunction::GetFeatureFunctions();
for (size_t i = 0; i < ffs.size(); ++i) {
@@ -186,15 +165,11 @@ int main(int argc, char** argv)
}
UTIL_THROW_IF(!phraseTable,util::Exception,"Unable to find scoring phrase table");
- Sentence sentence;
- phraseTable->InitializeForInput(sentence);
//
//Load and prune the phrase table. This is taken (with mods) from moses/TranslationModel/RuleTable/LoaderStandard.cpp
//
- string lineOrig;
-
std::ostream *progress = NULL;
IFVERBOSE(1) progress = &std::cerr;
util::FilePiece in(input_file.c_str(), progress);
@@ -205,7 +180,9 @@ int main(int argc, char** argv)
double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
- StringPiece previous;
+ string previous;
+ Lines lines;
+
while(true) {
try {
@@ -216,12 +193,31 @@ int main(int argc, char** argv)
util::TokenIter<util::MultiCharacter> pipes(line, "|||");
StringPiece sourcePhraseString(*pipes);
+ StringPiece targetPhraseString(*++pipes);
+ StringPiece scoreString(*++pipes);
+ scoreVector.clear();
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+ int processed;
+ float score = converter.StringToFloat(s->data(), s->length(), &processed);
+ UTIL_THROW_IF2(isnan(score), "Bad score " << *s);
+ scoreVector.push_back(FloorScore(TransformScore(score)));
+ }
+
if (sourcePhraseString != previous) {
- outputTopN(previous, phraseTable, input, cout);
- previous = sourcePhraseString;
+ outputTopN(lines, maxPhrases, cout);
+ previous = sourcePhraseString.as_string();
+ lines.clear();
}
+
+ ScoreComponentCollection scores;
+ scores.Assign(phraseTable,scoreVector);
+ float score = scores.InnerProduct(staticData.GetAllWeights());
+ lines.insert(pair<float,string>(score,line.as_string()));
+
+ }
+ if (!lines.empty()) {
+ outputTopN(lines, maxPhrases, cout);
}
- outputTopN(previous, phraseTable, input, cout);
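
The rewritten pruner no longer re-queries the phrase table per source phrase: it buffers each source phrase's raw lines in a multimap<float,string> keyed by the weighted model score, then walks the map in reverse so the best-scoring maxPhrases lines come out first. The selection step in isolation (scores and lines below are made up):

    #include <iostream>
    #include <map>
    #include <string>

    typedef std::multimap<float, std::string> Lines;

    int main() {
      Lines lines;
      lines.insert(std::make_pair(0.2f, "low ||| ..."));
      lines.insert(std::make_pair(0.9f, "high ||| ..."));
      lines.insert(std::make_pair(0.5f, "mid ||| ..."));
      std::size_t maxPhrases = 2, count = 0;
      // multimap iterates in ascending key order; reverse for best-first.
      for (Lines::const_reverse_iterator i = lines.rbegin();
           i != lines.rend() && count < maxPhrases; ++i, ++count)
        std::cout << i->second << '\n';   // "high ||| ...", "mid ||| ..."
    }
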
diff --git a/moses-chart-cmd/IOWrapper.h b/moses-chart-cmd/IOWrapper.h
deleted file mode 100644
index 974353190..000000000
--- a/moses-chart-cmd/IOWrapper.h
+++ /dev/null
@@ -1,151 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file on how to use moses library
-
-#pragma once
-
-#include <fstream>
-#include <vector>
-#include <set>
-#include "moses/TypeDef.h"
-#include "moses/Sentence.h"
-#include "moses/FactorTypeSet.h"
-#include "moses/ChartKBestExtractor.h"
-#include "moses/OutputCollector.h"
-#include "moses/ChartHypothesis.h"
-#include "search/applied.hh"
-#include "moses/ChartManager.h"
-
-namespace Moses
-{
-class FactorCollection;
-class ScoreComponentCollection;
-}
-
-namespace MosesChartCmd
-{
-
-/** Helper class that holds misc variables to write data out to command line.
- */
-class IOWrapper
-{
-protected:
- typedef std::vector<std::pair<Moses::Word, Moses::WordsRange> > ApplicationContext;
-
- long m_translationId;
-
- const std::vector<Moses::FactorType> &m_inputFactorOrder;
- const std::vector<Moses::FactorType> &m_outputFactorOrder;
- const Moses::FactorMask &m_inputFactorUsed;
- std::ostream *m_outputSearchGraphStream;
- std::ostream *m_detailedTranslationReportingStream;
- std::ostream *m_detailedTreeFragmentsTranslationReportingStream;
- //DIMw
- std::ostream *m_detailedAllTranslationReportingStream;
- std::ostream *m_alignmentInfoStream;
- std::ostream *m_unknownsStream;
- std::string m_inputFilePath;
- std::istream *m_inputStream;
- Moses::OutputCollector *m_detailOutputCollector;
- Moses::OutputCollector *m_detailTreeFragmentsOutputCollector;
- //DIMw
- Moses::OutputCollector *m_detailAllOutputCollector;
- Moses::OutputCollector *m_nBestOutputCollector;
- Moses::OutputCollector *m_searchGraphOutputCollector;
- Moses::OutputCollector *m_singleBestOutputCollector;
- Moses::OutputCollector *m_alignmentInfoCollector;
- Moses::OutputCollector *m_unknownsCollector;
-
- typedef std::set< std::pair<size_t, size_t> > Alignments;
- std::size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartKBestExtractor::Derivation &derivation, std::size_t startTarget);
- size_t OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget);
- void OutputAlignment(std::vector< std::set<size_t> > &retAlignmentsS2T, const Moses::AlignmentInfo &ai);
- void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void ReconstructApplicationContext(const Moses::ChartHypothesis &hypo,
- const Moses::Sentence &sentence,
- ApplicationContext &context);
- void ReconstructApplicationContext(const search::Applied *applied,
- const Moses::Sentence &sentence,
- ApplicationContext &context);
- void WriteApplicationContext(std::ostream &out,
- const ApplicationContext &context);
-
- void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out);
- void OutputFeatureScores( std::ostream& out
- , const Moses::ScoreComponentCollection &features
- , const Moses::FeatureFunction *ff
- , std::string &lastName );
-
-public:
- IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &inputFilePath="");
- ~IOWrapper();
-
- Moses::InputType* GetInput(Moses::InputType *inputType);
- void OutputBestHypo(const Moses::ChartHypothesis *hypo, long translationId);
- void OutputBestHypo(search::Applied applied, long translationId);
- void OutputBestHypo(const std::vector<const Moses::Factor*>& mbrBestHypo, long translationId);
- void OutputBestNone(long translationId);
- void OutputNBestList(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, long translationId);
- void OutputNBestList(const std::vector<search::Applied> &nbest, long translationId);
- void OutputNBestTrees(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, long translationId);
- void OutputDetailedTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputDetailedTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void OutputDetailedTreeFragmentsTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputDetailedTreeFragmentsTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void OutputDetailedAllTranslationReport(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, const Moses::ChartManager &manager, const Moses::Sentence &sentence, long translationId);
- void Backtrack(const Moses::ChartHypothesis *hypo);
-
- void ResetTranslationId();
-
- Moses::OutputCollector *GetSearchGraphOutputCollector() {
- return m_searchGraphOutputCollector;
- }
-
- void OutputAlignment(size_t translationId , const Moses::ChartHypothesis *hypo);
- void OutputUnknowns(const std::vector<Moses::Phrase*> &, long);
-
- static void FixPrecision(std::ostream &, size_t size=3);
-};
-
-}
diff --git a/moses-chart-cmd/Jamfile b/moses-chart-cmd/Jamfile
deleted file mode 100644
index 47a1bf885..000000000
--- a/moses-chart-cmd/Jamfile
+++ /dev/null
@@ -1,2 +0,0 @@
-exe moses_chart : Main.cpp mbr.cpp IOWrapper.cpp ../moses//moses $(TOP)//boost_iostreams ..//boost_filesystem ..//z ;
-
diff --git a/moses-chart-cmd/Main.cpp b/moses-chart-cmd/Main.cpp
deleted file mode 100644
index f5f053ada..000000000
--- a/moses-chart-cmd/Main.cpp
+++ /dev/null
@@ -1,385 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file on how to use moses library
-
-#ifdef WIN32
-// Include Visual Leak Detector
-//#include <vld.h>
-#endif
-
-#include <exception>
-#include <fstream>
-#include "Main.h"
-#include "moses/TranslationAnalysis.h"
-#include "mbr.h"
-#include "IOWrapper.h"
-
-#include "moses/FactorCollection.h"
-#include "moses/HypergraphOutput.h"
-#include "moses/Manager.h"
-#include "moses/Phrase.h"
-#include "moses/Util.h"
-#include "moses/Timer.h"
-#include "moses/Sentence.h"
-#include "moses/ConfusionNet.h"
-#include "moses/WordLattice.h"
-#include "moses/TreeInput.h"
-#include "moses/ThreadPool.h"
-#include "moses/ChartManager.h"
-#include "moses/ChartHypothesis.h"
-#include "moses/Incremental.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-
-#include "util/usage.hh"
-#include "util/exception.hh"
-
-using namespace std;
-using namespace Moses;
-using namespace MosesChartCmd;
-
-/** Enforce rounding */
-void fix(std::ostream& stream, size_t size)
-{
- stream.setf(std::ios::fixed);
- stream.precision(size);
-}
-
-/**
- * Translates a sentence.
- **/
-class TranslationTask : public Task
-{
-public:
- TranslationTask(InputType *source, IOWrapper &ioWrapper,
- boost::shared_ptr<HypergraphOutput<ChartManager> > hypergraphOutput)
- : m_source(source)
- , m_ioWrapper(ioWrapper)
- , m_hypergraphOutput(hypergraphOutput) {
- }
-
- ~TranslationTask() {
- delete m_source;
- }
-
- void Run() {
- const StaticData &staticData = StaticData::Instance();
- const size_t translationId = m_source->GetTranslationId();
-
- VERBOSE(2,"\nTRANSLATING(" << translationId << "): " << *m_source);
-
- if (staticData.GetSearchAlgorithm() == ChartIncremental) {
- Incremental::Manager manager(*m_source);
- const std::vector<search::Applied> &nbest = manager.ProcessSentence();
- if (!nbest.empty()) {
- m_ioWrapper.OutputBestHypo(nbest[0], translationId);
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- m_ioWrapper.OutputDetailedTranslationReport(&nbest[0], sentence, translationId);
- }
- if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- m_ioWrapper.OutputDetailedTreeFragmentsTranslationReport(&nbest[0], sentence, translationId);
- }
- } else {
- m_ioWrapper.OutputBestNone(translationId);
- }
- if (staticData.GetNBestSize() > 0)
- m_ioWrapper.OutputNBestList(nbest, translationId);
- return;
- }
-
- ChartManager manager(translationId,*m_source);
- manager.ProcessSentence();
-
- UTIL_THROW_IF2(staticData.UseMBR(), "Cannot use MBR");
-
- // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
- if (m_hypergraphOutput.get()) {
- m_hypergraphOutput->Write(manager);
- }
-
-
- // 1-best
- const ChartHypothesis *bestHypo = manager.GetBestHypothesis();
- m_ioWrapper.OutputBestHypo(bestHypo, translationId);
- IFVERBOSE(2) {
- PrintUserTime("Best Hypothesis Generation Time:");
- }
-
- if (!staticData.GetAlignmentOutputFile().empty()) {
- m_ioWrapper.OutputAlignment(translationId, bestHypo);
- }
-
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- m_ioWrapper.OutputDetailedTranslationReport(bestHypo, sentence, translationId);
- }
- if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- m_ioWrapper.OutputDetailedTreeFragmentsTranslationReport(bestHypo, sentence, translationId);
- }
- if (!staticData.GetOutputUnknownsFile().empty()) {
- m_ioWrapper.OutputUnknowns(manager.GetParser().GetUnknownSources(),
- translationId);
- }
-
- //DIMw
- if (staticData.IsDetailedAllTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- size_t nBestSize = staticData.GetNBestSize();
- std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
- manager.CalcNBest(nBestSize, nBestList, staticData.GetDistinctNBest());
- m_ioWrapper.OutputDetailedAllTranslationReport(nBestList, manager, sentence, translationId);
- }
-
- // n-best
- size_t nBestSize = staticData.GetNBestSize();
- if (nBestSize > 0) {
- VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl);
- std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
- manager.CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest());
- m_ioWrapper.OutputNBestList(nBestList, translationId);
- IFVERBOSE(2) {
- PrintUserTime("N-Best Hypotheses Generation Time:");
- }
- }
-
- if (staticData.GetOutputSearchGraph()) {
- std::ostringstream out;
- manager.OutputSearchGraphMoses( out);
- OutputCollector *oc = m_ioWrapper.GetSearchGraphOutputCollector();
- UTIL_THROW_IF2(oc == NULL, "File for search graph output not specified");
- oc->Write(translationId, out.str());
- }
-
- IFVERBOSE(2) {
- PrintUserTime("Sentence Decoding Time:");
- }
- manager.CalcDecoderStatistics();
- }
-
-private:
- // Non-copyable: copy constructor and assignment operator not implemented.
- TranslationTask(const TranslationTask &);
- TranslationTask &operator=(const TranslationTask &);
-
- InputType *m_source;
- IOWrapper &m_ioWrapper;
- boost::shared_ptr<HypergraphOutput<ChartManager> > m_hypergraphOutput;
-};
-
-bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
-{
- delete source;
- switch(inputType) {
- case SentenceInput:
- source = ioWrapper.GetInput(new Sentence);
- break;
- case ConfusionNetworkInput:
- source = ioWrapper.GetInput(new ConfusionNet);
- break;
- case WordLatticeInput:
- source = ioWrapper.GetInput(new WordLattice);
- break;
- case TreeInputType:
- source = ioWrapper.GetInput(new TreeInput);
- break;
- default:
- TRACE_ERR("Unknown input type: " << inputType << "\n");
- }
- return (source ? true : false);
-}
-static void PrintFeatureWeight(const FeatureFunction* ff)
-{
- cout << ff->GetScoreProducerDescription() << "=";
- size_t numScoreComps = ff->GetNumScoreComponents();
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- for (size_t i = 0; i < numScoreComps; ++i) {
- cout << " " << values[i];
- }
- cout << endl;
-
-}
-
-static void ShowWeights()
-{
- fix(cout,6);
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
-
- for (size_t i = 0; i < sff.size(); ++i) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- }
- else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- }
- else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
-}
-
-
-int main(int argc, char* argv[])
-{
- try {
- IFVERBOSE(1) {
- TRACE_ERR("command: ");
- for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
- TRACE_ERR(endl);
- }
-
- IOWrapper::FixPrecision(cout);
- IOWrapper::FixPrecision(cerr);
-
- // load data structures
- Parameter parameter;
- if (!parameter.LoadParam(argc, argv)) {
- return EXIT_FAILURE;
- }
-
- const StaticData &staticData = StaticData::Instance();
- if (!StaticData::LoadDataStatic(&parameter, argv[0]))
- return EXIT_FAILURE;
-
- if (parameter.isParamSpecified("show-weights")) {
- ShowWeights();
- exit(0);
- }
-
- UTIL_THROW_IF2(!staticData.IsChart(), "Must be SCFG model");
-
- // set up read/writing class
- IOWrapper *ioWrapper = GetIOWrapper(staticData);
-
- // check on weights
- const ScoreComponentCollection& weights = staticData.GetAllWeights();
- IFVERBOSE(2) {
- TRACE_ERR("The global weight vector looks like this: ");
- TRACE_ERR(weights);
- TRACE_ERR("\n");
- }
-
- boost::shared_ptr<HypergraphOutput<ChartManager> > hypergraphOutput;
- if (staticData.GetOutputSearchGraphHypergraph()) {
- hypergraphOutput.reset(new HypergraphOutput<ChartManager>(3));
- }
-
- if (ioWrapper == NULL)
- return EXIT_FAILURE;
-
-#ifdef WITH_THREADS
- ThreadPool pool(staticData.ThreadCount());
-#endif
-
- // read each sentence & decode
- InputType *source=0;
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
- IFVERBOSE(1)
- ResetUserTime();
-
- FeatureFunction::CallChangeSource(source);
-
- TranslationTask *task = new TranslationTask(source, *ioWrapper, hypergraphOutput);
- source = NULL; // task will delete source
-#ifdef WITH_THREADS
- pool.Submit(task); // pool will delete task
-#else
- task->Run();
- delete task;
-#endif
- }
-
-#ifdef WITH_THREADS
- pool.Stop(true); // flush remaining jobs
-#endif
-
- delete ioWrapper;
- FeatureFunction::Destroy();
-
- IFVERBOSE(1)
- PrintUserTime("End.");
-
- } catch (const std::exception &e) {
- std::cerr << "Exception: " << e.what() << std::endl;
- return EXIT_FAILURE;
- }
-
- IFVERBOSE(1) util::PrintUsage(std::cerr);
-
-#ifndef EXIT_RETURN
- // This avoids calling destructors (it can take a long time)
- exit(EXIT_SUCCESS);
-#else
- return EXIT_SUCCESS;
-#endif
-}
-
-IOWrapper *GetIOWrapper(const StaticData &staticData)
-{
- IOWrapper *ioWrapper;
- const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
- ,&outputFactorOrder = staticData.GetOutputFactorOrder();
- FactorMask inputFactorUsed(inputFactorOrder);
-
- // io
- if (staticData.GetParam("input-file").size() == 1) {
- VERBOSE(2,"IO from File" << endl);
- string filePath = staticData.GetParam("input-file")[0];
-
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath()
- , filePath);
- } else {
- VERBOSE(1,"IO from STDOUT/STDIN" << endl);
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath());
- }
- ioWrapper->ResetTranslationId();
-
- IFVERBOSE(1)
- PrintUserTime("Created input-output object");
-
- return ioWrapper;
-}
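A note on the lifecycle pattern in the deleted main() above: each input sentence is wrapped in a TranslationTask that owns its InputType, and the task is either handed to Moses' ThreadPool (which deletes it after running it) or run and deleted inline. A minimal sketch of that ownership contract, using std::thread instead of the Moses ThreadPool; ToyPool and its members are illustrative names, not part of the Moses API:

    #include <thread>
    #include <vector>

    // Minimal stand-in for a unit of work; Moses' Task plays the same role.
    struct Task {
      virtual ~Task() {}
      virtual void Run() = 0;
    };

    // Toy pool: one thread per task rather than a fixed set of workers, but
    // the same contract as pool.Submit(task) above: the pool takes ownership
    // and deletes the task once Run() returns.
    class ToyPool {
    public:
      void Submit(Task *task) {
        m_threads.emplace_back([task] {
          task->Run();
          delete task;   // mirrors "pool will delete task"
        });
      }
      void Stop() {      // mirrors pool.Stop(true): wait for remaining jobs
        for (std::thread &t : m_threads) t.join();
      }
    private:
      std::vector<std::thread> m_threads;
    };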
diff --git a/moses-chart-cmd/Main.h b/moses-chart-cmd/Main.h
deleted file mode 100644
index 319e3889c..000000000
--- a/moses-chart-cmd/Main.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file showing how to use the moses library
-
-#pragma once
-
-#include "moses/StaticData.h"
-
-namespace MosesChartCmd
-{
-class IOWrapper;
-}
-
-int main(int argc, char* argv[]);
-MosesChartCmd::IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
diff --git a/moses-chart-cmd/mbr.cpp b/moses-chart-cmd/mbr.cpp
deleted file mode 100644
index 551378054..000000000
--- a/moses-chart-cmd/mbr.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <iomanip>
-#include <vector>
-#include <map>
-#include <stdlib.h>
-#include <math.h>
-#include <algorithm>
-#include <stdio.h>
-#include "moses/TrellisPathList.h"
-#include "moses/TrellisPath.h"
-#include "moses/StaticData.h"
-#include "moses/Util.h"
-
-#include "mbr.h"
-
-using namespace std;
-using namespace Moses;
-
-
-/* Input:
- 1. a sorted n-best list, with duplicates filtered out, in the following format:
- 0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
-
- 2. a weight vector
- 3. BLEU order (default = 4)
- 4. scaling factor applied to the weight vector (default = 1.0)
-
- Output:
- the translation that minimises the Bayes risk of the n-best list
-*/
-
-int BLEU_ORDER = 4;
-int SMOOTH = 1;
-int DEBUG = 0;
-float min_interval = 1e-4;
-void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
-{
- vector< const Factor* > ngram;
- for (int k = 0; k < BLEU_ORDER; k++) {
- for(int i =0; i < max((int)sentence.size()-k,0); i++) {
- for ( int j = i; j<= i+k; j++) {
- ngram.push_back(sentence[j]);
- }
- ++allngrams[ngram];
- ngram.clear();
- }
- }
-}
-
-float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
-{
- int comps_n = 2*BLEU_ORDER+1;
- vector<int> comps(comps_n);
- float logbleu = 0.0, brevity;
-
- int hyp_length = sents[hyp].size();
-
- for (int i =0; i<BLEU_ORDER; i++) {
- comps[2*i] = 0;
- comps[2*i+1] = max(hyp_length-i,0);
- }
-
- map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
- map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
-
- for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
- it != hyp_ngrams.end(); it++) {
- map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
- if(ref_it != ref_ngrams.end()) {
- comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
- }
- }
- comps[comps_n-1] = sents[ref].size();
-
- if (DEBUG) {
- for ( int i = 0; i < comps_n; i++)
- cerr << "Comp " << i << " : " << comps[i];
- }
-
- for (int i=0; i<BLEU_ORDER; i++) {
- if (comps[0] == 0)
- return 0.0;
- if ( i > 0 )
- logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
- else
- logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
- }
- logbleu /= BLEU_ORDER;
- brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
- if (brevity < 0.0)
- logbleu += brevity;
- return exp(logbleu);
-}
-
-vector<const Factor*> doMBR(const TrellisPathList& nBestList)
-{
-// cerr << "Sentence " << sent << " has " << sents.size() << " candidate translations" << endl;
- float marginal = 0;
-
- vector<float> joint_prob_vec;
- vector< vector<const Factor*> > translations;
- float joint_prob;
- vector< map < vector <const Factor *>, int > > ngram_stats;
-
- TrellisPathList::const_iterator iter;
- //TrellisPath* hyp = NULL;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const TrellisPath &path = **iter;
- joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore());
- marginal += joint_prob;
- joint_prob_vec.push_back(joint_prob);
- //Cache ngram counts
- map < vector < const Factor *>, int > counts;
- vector<const Factor*> translation;
- GetOutputFactors(path, translation);
-
- //TO DO
- extract_ngrams(translation,counts);
- ngram_stats.push_back(counts);
- translations.push_back(translation);
- }
-
- vector<float> mbr_loss;
- float bleu, weightedLoss;
- float weightedLossCumul = 0;
- float minMBRLoss = 1000000;
- int minMBRLossIdx = -1;
-
- /* Main MBR computation done here */
- for (size_t i = 0; i < nBestList.GetSize(); i++) {
- weightedLossCumul = 0;
- for (size_t j = 0; j < nBestList.GetSize(); j++) {
- if ( i != j) {
- bleu = calculate_score(translations, j, i,ngram_stats );
- weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
- weightedLossCumul += weightedLoss;
- if (weightedLossCumul > minMBRLoss)
- break;
- }
- }
- if (weightedLossCumul < minMBRLoss) {
- minMBRLoss = weightedLossCumul;
- minMBRLossIdx = i;
- }
- }
- /* Find sentence that minimises Bayes Risk under 1- BLEU loss */
- return translations[minMBRLossIdx];
-}
-
-void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
-{
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
- const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
- assert (outputFactorOrder.size() == 1);
-
- // print the surface factor of the translation
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const Phrase &phrase = edge.GetCurrTargetPhrase();
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
-
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
- translation.push_back(factor);
- }
- }
-}
-
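For reference, the decision rule that doMBR implements can be written out; this restates the code above rather than any separate Moses documentation. With posterior weights p(e_j) obtained by untransforming the scaled model scores, the hypothesis picked from the n-best list N is

    \hat{e} = \operatorname*{arg\,min}_{e_i \in N}
        \sum_{j \neq i} \bigl(1 - \mathrm{BLEU}(e_i, e_j)\bigr)\,
        \frac{p(e_j)}{\sum_{k} p(e_k)}

and calculate_score computes the smoothed sentence-level BLEU used as the gain (returning 0 outright when there are no unigram matches):

    \log \mathrm{BLEU}(h, r) = \frac{1}{4} \sum_{n=1}^{4}
        \log \frac{m_n + s_n}{c_n + s_n}
        + \min\!\Bigl(0,\ 1 - \tfrac{|r|}{|h|}\Bigr),
    \qquad s_1 = 0, \quad s_{n>1} = 1,

where m_n is the clipped count of n-gram matches against e_j, c_n = max(|h| - n + 1, 0) is the number of n-grams in the candidate, and the last term is the brevity penalty. The early break in the inner loop of doMBR is a pure optimization: the cumulative loss only grows with j, so a candidate can be abandoned as soon as it exceeds the best loss found so far.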
diff --git a/moses-chart-cmd/mbr.h b/moses-chart-cmd/mbr.h
deleted file mode 100644
index cd40a13b1..000000000
--- a/moses-chart-cmd/mbr.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-
-#include <vector>
-#include <map>
-#include "moses/TrellisPathList.h"
-#include "moses/TrellisPath.h"
-#include "moses/Factor.h"
-
-std::vector<const Moses::Factor*> doMBR(const Moses::TrellisPathList& nBestList);
-void GetOutputFactors(const Moses::TrellisPath &path, std::vector <const Moses::Factor*> &translation);
-float calculate_score(const std::vector< std::vector<const Moses::Factor*> > & sents, int ref, int hyp, std::vector < std::map < std::vector < const Moses::Factor *>, int > > & ngram_stats );
-
diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
deleted file mode 100644
index 1483e0472..000000000
--- a/moses-cmd/IOWrapper.cpp
+++ /dev/null
@@ -1,661 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
- ***********************************************************************/
-
-// example file showing how to use the moses library
-
-#include <iostream>
-#include <stack>
-#include <boost/algorithm/string.hpp>
-
-#include "moses/TypeDef.h"
-#include "moses/Util.h"
-#include "moses/Hypothesis.h"
-#include "moses/WordsRange.h"
-#include "moses/TrellisPathList.h"
-#include "moses/StaticData.h"
-#include "moses/FeatureVector.h"
-#include "moses/InputFileStream.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-#include "util/exception.hh"
-
-#include "IOWrapper.h"
-
-using namespace std;
-using namespace Moses;
-
-namespace MosesCmd
-{
-
-IOWrapper::IOWrapper(
- const vector<FactorType> &inputFactorOrder
- , const vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const string &nBestFilePath)
- :m_inputFactorOrder(inputFactorOrder)
- ,m_outputFactorOrder(outputFactorOrder)
- ,m_inputFactorUsed(inputFactorUsed)
- ,m_inputFile(NULL)
- ,m_inputStream(&std::cin)
- ,m_nBestStream(NULL)
- ,m_outputWordGraphStream(NULL)
- ,m_outputSearchGraphStream(NULL)
- ,m_detailedTranslationReportingStream(NULL)
- ,m_alignmentOutputStream(NULL)
-{
- Initialization(inputFactorOrder, outputFactorOrder
- , inputFactorUsed
- , nBestSize, nBestFilePath);
-}
-
-IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
- , const std::vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &inputFilePath)
- :m_inputFactorOrder(inputFactorOrder)
- ,m_outputFactorOrder(outputFactorOrder)
- ,m_inputFactorUsed(inputFactorUsed)
- ,m_inputFilePath(inputFilePath)
- ,m_inputFile(new InputFileStream(inputFilePath))
- ,m_nBestStream(NULL)
- ,m_outputWordGraphStream(NULL)
- ,m_outputSearchGraphStream(NULL)
- ,m_detailedTranslationReportingStream(NULL)
- ,m_alignmentOutputStream(NULL)
-{
- Initialization(inputFactorOrder, outputFactorOrder
- , inputFactorUsed
- , nBestSize, nBestFilePath);
-
- m_inputStream = m_inputFile;
-}
-
-IOWrapper::~IOWrapper()
-{
- if (m_inputFile != NULL)
- delete m_inputFile;
- if (m_nBestStream != NULL && !m_surpressSingleBestOutput) {
- // outputting n-best to file, rather than stdout. need to close file and delete obj
- delete m_nBestStream;
- }
- if (m_outputWordGraphStream != NULL) {
- delete m_outputWordGraphStream;
- }
- if (m_outputSearchGraphStream != NULL) {
- delete m_outputSearchGraphStream;
- }
- delete m_detailedTranslationReportingStream;
- delete m_alignmentOutputStream;
-}
-
-void IOWrapper::Initialization(const std::vector<FactorType> &/*inputFactorOrder*/
- , const std::vector<FactorType> &/*outputFactorOrder*/
- , const FactorMask &/*inputFactorUsed*/
- , size_t nBestSize
- , const std::string &nBestFilePath)
-{
- const StaticData &staticData = StaticData::Instance();
-
- // n-best
- m_surpressSingleBestOutput = false;
-
- if (nBestSize > 0) {
- if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") {
- m_nBestStream = &std::cout;
- m_surpressSingleBestOutput = true;
- } else {
- std::ofstream *file = new std::ofstream;
- m_nBestStream = file;
- file->open(nBestFilePath.c_str());
- }
- }
-
- // wordgraph output
- if (staticData.GetOutputWordGraph()) {
- string fileName = staticData.GetParam("output-word-graph")[0];
- std::ofstream *file = new std::ofstream;
- m_outputWordGraphStream = file;
- file->open(fileName.c_str());
- }
-
-
- // search graph output
- if (staticData.GetOutputSearchGraph()) {
- string fileName;
- if (staticData.GetOutputSearchGraphExtended())
- fileName = staticData.GetParam("output-search-graph-extended")[0];
- else
- fileName = staticData.GetParam("output-search-graph")[0];
- std::ofstream *file = new std::ofstream;
- m_outputSearchGraphStream = file;
- file->open(fileName.c_str());
- }
-
- // detailed translation reporting
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
- m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
- UTIL_THROW_IF(!m_detailedTranslationReportingStream->good(),
- util::FileOpenException,
- "File for output of detailed translation report could not be opened");
- }
-
- // sentence alignment output
- if (! staticData.GetAlignmentOutputFile().empty()) {
- m_alignmentOutputStream = new ofstream(staticData.GetAlignmentOutputFile().c_str());
- UTIL_THROW_IF(!m_alignmentOutputStream->good(),
- util::FileOpenException,
- "File for output of word alignment could not be opened");
- }
-
-}
-
-InputType*
-IOWrapper::
-GetInput(InputType* inputType)
-{
- if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
- if (long x = inputType->GetTranslationId()) {
- if (x>=m_translationId) m_translationId = x+1;
- } else inputType->SetTranslationId(m_translationId++);
-
- return inputType;
- } else {
- delete inputType;
- return NULL;
- }
-}
-
-std::map<size_t, const Factor*> GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor)
-{
- const InputPath &inputPath = hypo.GetTranslationOption().GetInputPath();
- const Phrase &inputPhrase = inputPath.GetPhrase();
-
- std::map<size_t, const Factor*> ret;
-
- for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) {
- const Factor *factor = inputPhrase.GetFactor(sourcePos, placeholderFactor);
- if (factor) {
- std::set<size_t> targetPos = hypo.GetTranslationOption().GetTargetPhrase().GetAlignTerm().GetAlignmentsForSource(sourcePos);
- UTIL_THROW_IF2(targetPos.size() != 1,
- "Placeholder should be aligned to one, and only one, word");
- ret[*targetPos.begin()] = factor;
- }
- }
-
- return ret;
-}
-
-/***
- * print surface factor only for the given phrase
- */
-void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
- char reportSegmentation, bool reportAllFactors)
-{
- UTIL_THROW_IF2(outputFactorOrder.size() == 0,
- "Must specify at least 1 output factor");
- const TargetPhrase& phrase = edge.GetCurrTargetPhrase();
- bool markUnknown = StaticData::Instance().GetMarkUnknown();
- if (reportAllFactors == true) {
- out << phrase;
- } else {
- FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
-
- std::map<size_t, const Factor*> placeholders;
- if (placeholderFactor != NOT_FOUND) {
- // creates map of target position -> factor for placeholders
- placeholders = GetPlaceholders(edge, placeholderFactor);
- }
-
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
-
- if (placeholders.size()) {
- // do placeholders
- std::map<size_t, const Factor*>::const_iterator iter = placeholders.find(pos);
- if (iter != placeholders.end()) {
- factor = iter->second;
- }
- }
-
- UTIL_THROW_IF2(factor == NULL,
- "No factor 0 at position " << pos);
-
- //preface surface form with UNK if marking unknowns
- const Word &word = phrase.GetWord(pos);
- if(markUnknown && word.IsOOV()) {
- out << "UNK" << *factor;
- } else {
- out << *factor;
- }
-
- for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
- UTIL_THROW_IF2(factor == NULL,
- "No factor " << i << " at position " << pos);
-
- out << "|" << *factor;
- }
- out << " ";
- }
- }
-
- // trace ("report segmentation") option "-t" / "-tt"
- if (reportSegmentation > 0 && phrase.GetSize() > 0) {
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- const int sourceStart = sourceRange.GetStartPos();
- const int sourceEnd = sourceRange.GetEndPos();
- out << "|" << sourceStart << "-" << sourceEnd; // enriched "-tt"
- if (reportSegmentation == 2) {
- out << ",wa=";
- const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
- OutputAlignment(out, ai, 0, 0);
- out << ",total=";
- out << edge.GetScore() - edge.GetPrevHypo()->GetScore();
- out << ",";
- ScoreComponentCollection scoreBreakdown(edge.GetScoreBreakdown());
- scoreBreakdown.MinusEquals(edge.GetPrevHypo()->GetScoreBreakdown());
- OutputAllFeatureScores(scoreBreakdown, out);
- }
- out << "| ";
- }
-}
-
-void OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder,
- char reportSegmentation, bool reportAllFactors)
-{
- if (hypo != NULL) {
- // recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence
- OutputBestSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
- OutputSurface(out, *hypo, outputFactorOrder, reportSegmentation, reportAllFactors);
- }
-}
-
-void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
-{
- typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
- AlignVec alignments = ai.GetSortedAlignments();
-
- AlignVec::const_iterator it;
- for (it = alignments.begin(); it != alignments.end(); ++it) {
- const std::pair<size_t,size_t> &alignment = **it;
- out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
- }
-
-}
-
-void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
-{
- size_t targetOffset = 0;
-
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const TargetPhrase &tp = edge.GetCurrTargetPhrase();
- size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
-
- OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
-
- targetOffset += tp.GetSize();
- }
- out << std::endl;
-}
-
-void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
-{
- std::vector<const Hypothesis *> edges;
- const Hypothesis *currentHypo = hypo;
- while (currentHypo) {
- edges.push_back(currentHypo);
- currentHypo = currentHypo->GetPrevHypo();
- }
-
- OutputAlignment(out, edges);
-
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
-{
- ostringstream out;
- OutputAlignment(out, edges);
-
- collector->Write(lineNo,out.str());
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const Hypothesis *hypo)
-{
- if (collector) {
- std::vector<const Hypothesis *> edges;
- const Hypothesis *currentHypo = hypo;
- while (currentHypo) {
- edges.push_back(currentHypo);
- currentHypo = currentHypo->GetPrevHypo();
- }
-
- OutputAlignment(collector,lineNo, edges);
- }
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const TrellisPath &path)
-{
- if (collector) {
- OutputAlignment(collector,lineNo, path.GetEdges());
- }
-}
-
-void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/, char reportSegmentation, bool reportAllFactors, std::ostream &out)
-{
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
-
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- OutputSurface(out, edge, StaticData::Instance().GetOutputFactorOrder(), reportSegmentation, reportAllFactors);
- }
- out << endl;
-}
-
-void IOWrapper::Backtrack(const Hypothesis *hypo)
-{
-
- if (hypo->GetPrevHypo() != NULL) {
- VERBOSE(3,hypo->GetId() << " <= ");
- Backtrack(hypo->GetPrevHypo());
- }
-}
-
-void OutputBestHypo(const std::vector<Word>& mbrBestHypo, long /*translationId*/, char /*reportSegmentation*/, bool /*reportAllFactors*/, ostream& out)
-{
-
- for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
- const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
- UTIL_THROW_IF2(factor == NULL,
- "No factor 0 at position " << i);
- if (i>0) out << " " << *factor;
- else out << *factor;
- }
- out << endl;
-}
-
-
-void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
-{
- if (hypo->GetPrevHypo()) {
- OutputInput(map, hypo->GetPrevHypo());
- map[hypo->GetCurrSourceWordsRange().GetStartPos()] = &hypo->GetTranslationOption().GetInputPath().GetPhrase();
- }
-}
-
-void OutputInput(std::ostream& os, const Hypothesis* hypo)
-{
- size_t len = hypo->GetInput().GetSize();
- std::vector<const Phrase*> inp_phrases(len, 0);
- OutputInput(inp_phrases, hypo);
- for (size_t i=0; i<len; ++i)
- if (inp_phrases[i]) os << *inp_phrases[i];
-}
-
-void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, char reportSegmentation, bool reportAllFactors)
-{
- if (hypo != NULL) {
- VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
- VERBOSE(3,"Best path: ");
- Backtrack(hypo);
- VERBOSE(3,"0" << std::endl);
- if (!m_surpressSingleBestOutput) {
- if (StaticData::Instance().GetOutputHypoScore()) {
- cout << hypo->GetTotalScore() << " ";
- }
-
- if (StaticData::Instance().IsPathRecoveryEnabled()) {
- OutputInput(cout, hypo);
- cout << "||| ";
- }
- OutputBestSurface(cout, hypo, m_outputFactorOrder, reportSegmentation, reportAllFactors);
- cout << endl;
- }
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- if (!m_surpressSingleBestOutput) {
- cout << endl;
- }
- }
-}
-
-void OutputNBest(std::ostream& out
- , const Moses::TrellisPathList &nBestList
- , const std::vector<Moses::FactorType>& outputFactorOrder
- , long translationId
- , char reportSegmentation)
-{
- const StaticData &staticData = StaticData::Instance();
- bool reportAllFactors = staticData.GetReportAllFactorsNBest();
- bool includeSegmentation = staticData.NBestIncludesSegmentation();
- bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
-
- TrellisPathList::const_iterator iter;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const TrellisPath &path = **iter;
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
-
- // print the surface factor of the translation
- out << translationId << " ||| ";
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- OutputSurface(out, edge, outputFactorOrder, reportSegmentation, reportAllFactors);
- }
- out << " |||";
-
- // print scores with feature names
- OutputAllFeatureScores(path.GetScoreBreakdown(), out );
-
- // total
- out << " ||| " << path.GetTotalScore();
-
- //phrase-to-phrase segmentation
- if (includeSegmentation) {
- out << " |||";
- for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- WordsRange targetRange = path.GetTargetWordsRange(edge);
- out << " " << sourceRange.GetStartPos();
- if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
- out << "-" << sourceRange.GetEndPos();
- }
- out<< "=" << targetRange.GetStartPos();
- if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
- out<< "-" << targetRange.GetEndPos();
- }
- }
- }
-
- if (includeWordAlignment) {
- out << " ||| ";
- for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- WordsRange targetRange = path.GetTargetWordsRange(edge);
- const int sourceOffset = sourceRange.GetStartPos();
- const int targetOffset = targetRange.GetStartPos();
- const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
-
- OutputAlignment(out, ai, sourceOffset, targetOffset);
-
- }
- }
-
- if (StaticData::Instance().IsPathRecoveryEnabled()) {
- out << " ||| ";
- OutputInput(out, edges[0]);
- }
-
- out << endl;
- }
-
- out << std::flush;
-}
-
-void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out)
-{
- std::string lastName = "";
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for( size_t i=0; i<sff.size(); i++ ) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
- && ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- for( size_t i=0; i<slf.size(); i++ ) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
-}
-
-void OutputFeatureScores( std::ostream& out
- , const ScoreComponentCollection &features
- , const FeatureFunction *ff
- , std::string &lastName )
-{
- const StaticData &staticData = StaticData::Instance();
- bool labeledOutput = staticData.IsLabeledNBestList();
-
- // regular features (not sparse)
- if (ff->GetNumScoreComponents() != 0) {
- if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
- lastName = ff->GetScoreProducerDescription();
- out << " " << lastName << "=";
- }
- vector<float> scores = features.GetScoresForProducer( ff );
- for (size_t j = 0; j<scores.size(); ++j) {
- out << " " << scores[j];
- }
- }
-
- // sparse features
- const FVector scores = features.GetVectorForProducer( ff );
- for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
- out << " " << i->first << "= " << i->second;
- }
-}
-
-void OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId)
-{
- for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
- out << translationId;
- out << " |||";
- const vector<Word> mbrHypo = si->GetWords();
- for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
- const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
- if (i>0) out << " " << *factor;
- else out << *factor;
- }
- out << " |||";
- out << " map: " << si->GetMapScore();
- out << " w: " << mbrHypo.size();
- const vector<float>& ngramScores = si->GetNgramScores();
- for (size_t i = 0; i < ngramScores.size(); ++i) {
- out << " " << ngramScores[i];
- }
- out << " ||| " << si->GetScore();
-
- out << endl;
- }
-}
-
-
-void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solutions,long translationId)
-{
- OutputLatticeMBRNBest(*m_nBestStream, solutions,translationId);
-}
-
-bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
-{
- if (source) delete source;
- switch(inputType) {
- case SentenceInput:
- source = ioWrapper.GetInput(new Sentence);
- break;
- case ConfusionNetworkInput:
- source = ioWrapper.GetInput(new ConfusionNet);
- break;
- case WordLatticeInput:
- source = ioWrapper.GetInput(new WordLattice);
- break;
- default:
- TRACE_ERR("Unknown input type: " << inputType << "\n");
- source = NULL;
- }
- return (source ? true : false);
-}
-
-
-
-IOWrapper *GetIOWrapper(const StaticData &staticData)
-{
- IOWrapper *ioWrapper;
- const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
- ,&outputFactorOrder = staticData.GetOutputFactorOrder();
- FactorMask inputFactorUsed(inputFactorOrder);
-
- // io
- if (staticData.GetParam("input-file").size() == 1) {
- VERBOSE(2,"IO from File" << endl);
- string filePath = staticData.GetParam("input-file")[0];
-
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath()
- , filePath);
- } else {
- VERBOSE(1,"IO from STDOUT/STDIN" << endl);
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath());
- }
- ioWrapper->ResetTranslationId();
-
- IFVERBOSE(1)
- PrintUserTime("Created input-output object");
-
- return ioWrapper;
-}
-
-}
-
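For orientation, OutputNBest above, together with OutputAllFeatureScores and OutputFeatureScores, writes one line per hypothesis in the familiar Moses n-best format: translation id, surface string, labeled feature scores, and total score, separated by " ||| ", with optional phrase segmentation and word-alignment fields appended when the corresponding options are set. An illustrative line (feature names and values are invented; only the layout follows the code above):

    0 ||| this is a small house ||| LM0= -12.31 TM0= -4.12 -5.07 WordPenalty0= -5 ||| -21.43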
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
deleted file mode 100644
index ed2537986..000000000
--- a/moses-cmd/IOWrapper.h
+++ /dev/null
@@ -1,164 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file showing how to use the moses library
-
-#ifndef moses_cmd_IOWrapper_h
-#define moses_cmd_IOWrapper_h
-
-#include <cassert>
-#include <fstream>
-#include <ostream>
-#include <vector>
-
-#include "moses/TypeDef.h"
-#include "moses/Sentence.h"
-#include "moses/FactorTypeSet.h"
-#include "moses/FactorCollection.h"
-#include "moses/Hypothesis.h"
-#include "moses/OutputCollector.h"
-#include "moses/TrellisPathList.h"
-#include "moses/InputFileStream.h"
-#include "moses/InputType.h"
-#include "moses/WordLattice.h"
-#include "moses/LatticeMBR.h"
-
-namespace Moses
-{
-class ScoreComponentCollection;
-class Hypothesis;
-class Factor;
-}
-
-namespace MosesCmd
-{
-
-/** Helper class that holds miscellaneous variables used to write data out to the command line.
- */
-class IOWrapper
-{
-protected:
- long m_translationId;
-
- const std::vector<Moses::FactorType> &m_inputFactorOrder;
- const std::vector<Moses::FactorType> &m_outputFactorOrder;
- const Moses::FactorMask &m_inputFactorUsed;
- std::string m_inputFilePath;
- Moses::InputFileStream *m_inputFile;
- std::istream *m_inputStream;
- std::ostream *m_nBestStream
- ,*m_outputWordGraphStream,*m_outputSearchGraphStream;
- std::ostream *m_detailedTranslationReportingStream;
- std::ofstream *m_alignmentOutputStream;
- bool m_surpressSingleBestOutput;
-
- void Initialization(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath);
-
-
-public:
- IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath);
-
- IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &infilePath);
- ~IOWrapper();
-
- Moses::InputType* GetInput(Moses::InputType *inputType);
-
- void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, char reportSegmentation, bool reportAllFactors);
- void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
- void Backtrack(const Moses::Hypothesis *hypo);
-
- void ResetTranslationId() {
- m_translationId = 0;
- }
-
- std::ofstream *GetAlignmentOutputStream() {
- return m_alignmentOutputStream;
- }
-
- std::ostream &GetOutputWordGraphStream() {
- return *m_outputWordGraphStream;
- }
- std::ostream &GetOutputSearchGraphStream() {
- return *m_outputSearchGraphStream;
- }
-
- std::ostream &GetDetailedTranslationReportingStream() {
- assert (m_detailedTranslationReportingStream);
- return *m_detailedTranslationReportingStream;
- }
-};
-
-IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
-bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source);
-void OutputLanguageModelOrder(std::ostream &out, const Moses::Hypothesis *hypo, Moses::Manager &manager);
-void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, char reportSegmentation, bool reportAllFactors);
-void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
-void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
- char reportSegmentation, bool reportAllFactors, std::ostream& out);
-void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,char reportSegmentation, bool reportAllFactors, std::ostream &out);
-void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
-void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
-void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
-void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
-void OutputAlignment(std::ostream &out, const Moses::AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset);
-
-void OutputNBest(std::ostream& out
- , const Moses::TrellisPathList &nBestList
- , const std::vector<Moses::FactorType>& outputFactorOrder
- , long translationId
- , char reportSegmentation);
-void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out);
-void OutputFeatureScores( std::ostream& out
- , const Moses::ScoreComponentCollection &features
- , const Moses::FeatureFunction *ff
- , std::string &lastName );
-
-// creates a map of TARGET positions which should be replaced by word using placeholder
-std::map<size_t, const Moses::Factor*> GetPlaceholders(const Moses::Hypothesis &hypo, Moses::FactorType placeholderFactor);
-
-}
-
-#endif
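The header above declared the whole I/O surface that the deleted moses-cmd driver relied on; this commit folds it into moses/IOWrapper (see the LatticeMBRGrid.cpp hunks below). The canonical call sequence, condensed from the deleted main() functions earlier in this diff, with decoding and error handling omitted:

    // Sketch only: condensed from the removed drivers, not new API.
    IOWrapper *io = GetIOWrapper(StaticData::Instance());
    InputType *source = NULL;
    while (ReadInput(*io, StaticData::Instance().GetInputType(), source)) {
      // ... run a Manager over *source, then write results through io ...
    }
    delete io;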
diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile
index 8d54e0515..7ee90850c 100644
--- a/moses-cmd/Jamfile
+++ b/moses-cmd/Jamfile
@@ -1,4 +1,4 @@
-alias deps : IOWrapper.cpp mbr.cpp ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ;
+alias deps : ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ;
exe moses : Main.cpp deps ;
exe lmbrgrid : LatticeMBRGrid.cpp deps ;
diff --git a/moses-cmd/LatticeMBRGrid.cpp b/moses-cmd/LatticeMBRGrid.cpp
index 904275339..f00f40fd0 100644
--- a/moses-cmd/LatticeMBRGrid.cpp
+++ b/moses-cmd/LatticeMBRGrid.cpp
@@ -46,7 +46,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include <stdexcept>
#include <set>
-#include "IOWrapper.h"
+#include "moses/IOWrapper.h"
#include "moses/LatticeMBR.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
@@ -55,12 +55,11 @@ POSSIBILITY OF SUCH DAMAGE.
using namespace std;
using namespace Moses;
-using namespace MosesCmd;
//keys
enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale};
-namespace MosesCmd
+namespace Moses
{
class Grid
@@ -159,7 +158,7 @@ int main(int argc, char* argv[])
StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
staticData.SetUseLatticeMBR(true);
- IOWrapper* ioWrapper = GetIOWrapper(staticData);
+ IOWrapper* ioWrapper = IOWrapper::GetIOWrapper(staticData);
if (!ioWrapper) {
throw runtime_error("Failed to initialise IOWrapper");
@@ -178,10 +177,11 @@ int main(int argc, char* argv[])
const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
++lineCount;
- Sentence sentence;
- Manager manager(lineCount, *source, staticData.GetSearchAlgorithm());
+ source->SetTranslationId(lineCount);
+
+ Manager manager(*source, staticData.GetSearchAlgorithm());
manager.ProcessSentence();
TrellisPathList nBestList;
manager.CalcNBest(nBestSize, nBestList,true);
@@ -200,7 +200,7 @@ int main(int argc, char* argv[])
staticData.SetMBRScale(scale);
cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
- OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
+ ioWrapper->OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
staticData.GetReportAllFactors(),cout);
}
}
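The hunks above keep lmbrgrid's overall structure: for each input line it decodes once, extracts an n-best list, and then sweeps the four lattice-MBR hyperparameters. A sketch of that sweep; the p/r/prune setters are left as comments because only SetMBRScale is visible in this hunk, so their exact names here would be assumptions:

    // one decode per input line, then a 4-dimensional parameter sweep
    for (float p : p_grid)
      for (float r : r_grid)
        for (float prune : prune_grid)
          for (float scale : scale_grid) {
            // staticData.Set...(p); staticData.Set...(r); staticData.Set...(prune);
            staticData.SetMBRScale(scale);   // visible in the hunk above
            vector<Word> best = doLatticeMBR(manager, nBestList);
            cout << lineCount << " ||| " << p << " " << r << " "
                 << prune << " " << scale << " ||| ";
            // ... then the best hypothesis, via ioWrapper->OutputBestHypo
          }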
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 713ded2e3..319aede20 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -34,389 +34,34 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//#include <vld.h>
#endif
-#include "moses/TranslationAnalysis.h"
-#include "IOWrapper.h"
-#include "mbr.h"
-
+#include "moses/IOWrapper.h"
#include "moses/Hypothesis.h"
#include "moses/HypergraphOutput.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
+#include "moses/TypeDef.h"
#include "moses/Util.h"
#include "moses/Timer.h"
-#include "moses/ThreadPool.h"
-#include "moses/OutputCollector.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/TranslationTask.h"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
#endif
-using namespace std;
-using namespace Moses;
-using namespace MosesCmd;
-
-namespace MosesCmd
-{
-// output floats with three significant digits
-static const size_t PRECISION = 3;
-
-/** Enforce rounding */
-void fix(std::ostream& stream, size_t size)
-{
- stream.setf(std::ios::fixed);
- stream.precision(size);
-}
-
-/** Translates a sentence.
- * - calls the search (Manager)
- * - applies the decision rule
- * - outputs best translation and additional reporting
- **/
-class TranslationTask : public Task
-{
-
-public:
-
- TranslationTask(size_t lineNumber,
- InputType* source, OutputCollector* outputCollector, OutputCollector* nbestCollector,
- OutputCollector* latticeSamplesCollector,
- OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
- OutputCollector* detailedTranslationCollector,
- OutputCollector* alignmentInfoCollector,
- OutputCollector* unknownsCollector,
- bool outputSearchGraphSLF,
- boost::shared_ptr<HypergraphOutput<Manager> > hypergraphOutput) :
- m_source(source), m_lineNumber(lineNumber),
- m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
- m_latticeSamplesCollector(latticeSamplesCollector),
- m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
- m_detailedTranslationCollector(detailedTranslationCollector),
- m_alignmentInfoCollector(alignmentInfoCollector),
- m_unknownsCollector(unknownsCollector),
- m_outputSearchGraphSLF(outputSearchGraphSLF),
- m_hypergraphOutput(hypergraphOutput) {}
-
- /** Translate one sentence
- * gets called by main function implemented at end of this source file */
- void Run() {
- // shorthand for "global data"
- const StaticData &staticData = StaticData::Instance();
-
- // input sentence
- Sentence sentence;
-
- // report wall time spent on translation
- Timer translationTime;
- translationTime.start();
-
- // report thread number
-#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
- TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << pthread_self() << std::endl);
+#ifdef PT_UG
+#include <boost/foreach.hpp>
+#include "moses/TranslationModel/UG/mmsapt.h"
+#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
#endif
+using namespace std;
+using namespace Moses;
- // execute the translation
- // note: this executes the search, resulting in a search graph
- // we still need to apply the decision rule (MAP, MBR, ...)
- Timer initTime;
- initTime.start();
- Manager manager(m_lineNumber, *m_source,staticData.GetSearchAlgorithm());
- VERBOSE(1, "Line " << m_lineNumber << ": Initialize search took " << initTime << " seconds total" << endl);
- manager.ProcessSentence();
-
- // we are done with search, let's look what we got
- Timer additionalReportingTime;
- additionalReportingTime.start();
-
- // output word graph
- if (m_wordGraphCollector) {
- ostringstream out;
- fix(out,PRECISION);
- manager.GetWordGraph(m_lineNumber, out);
- m_wordGraphCollector->Write(m_lineNumber, out.str());
- }
-
- // output search graph
- if (m_searchGraphCollector) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraph(m_lineNumber, out);
- m_searchGraphCollector->Write(m_lineNumber, out.str());
-
-#ifdef HAVE_PROTOBUF
- if (staticData.GetOutputSearchGraphPB()) {
- ostringstream sfn;
- sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << m_lineNumber << ".pb" << ends;
- string fn = sfn.str();
- VERBOSE(2, "Writing search graph to " << fn << endl);
- fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out);
- manager.SerializeSearchGraphPB(m_lineNumber, output);
- }
-#endif
- }
-
- // Output search graph in HTK standard lattice format (SLF)
- if (m_outputSearchGraphSLF) {
- stringstream fileName;
- fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
- std::ofstream *file = new std::ofstream;
- file->open(fileName.str().c_str());
- if (file->is_open() && file->good()) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraphAsSLF(m_lineNumber, out);
- *file << out.str();
- file -> flush();
- } else {
- TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
- }
- delete file;
- }
-
- // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
- if (m_hypergraphOutput.get()) {
- m_hypergraphOutput->Write(manager);
- }
-
- additionalReportingTime.stop();
-
- // apply decision rule and output best translation(s)
- if (m_outputCollector) {
- ostringstream out;
- ostringstream debug;
- fix(debug,PRECISION);
-
- // all derivations - send them to debug stream
- if (staticData.PrintAllDerivations()) {
- additionalReportingTime.start();
- manager.PrintAllDerivations(m_lineNumber, debug);
- additionalReportingTime.stop();
- }
-
- Timer decisionRuleTime;
- decisionRuleTime.start();
-
- // MAP decoding: best hypothesis
- const Hypothesis* bestHypo = NULL;
- if (!staticData.UseMBR()) {
- bestHypo = manager.GetBestHypothesis();
- if (bestHypo) {
- if (StaticData::Instance().GetOutputHypoScore()) {
- out << bestHypo->GetTotalScore() << ' ';
- }
- if (staticData.IsPathRecoveryEnabled()) {
- OutputInput(out, bestHypo);
- out << "||| ";
- }
- if (staticData.GetParam("print-id").size() && Scan<bool>(staticData.GetParam("print-id")[0]) ) {
- out << m_source->GetTranslationId() << " ";
- }
-
- if (staticData.GetReportSegmentation() == 2) {
- manager.GetOutputLanguageModelOrder(out, bestHypo);
- }
- OutputBestSurface(
- out,
- bestHypo,
- staticData.GetOutputFactorOrder(),
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors());
- if (staticData.PrintAlignmentInfo()) {
- out << "||| ";
- OutputAlignment(out, bestHypo);
- }
-
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
- IFVERBOSE(1) {
- debug << "BEST TRANSLATION: " << *bestHypo << endl;
- }
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- }
-
- out << endl;
- }
-
- // MBR decoding (n-best MBR, lattice MBR, consensus)
- else {
- // we first need the n-best translations
- size_t nBestSize = staticData.GetMBRSize();
- if (nBestSize <= 0) {
- cerr << "ERROR: number of MBR candidate translations must be greater than zero (option mbr-size)" << endl;
- exit(1);
- }
- TrellisPathList nBestList;
- manager.CalcNBest(nBestSize, nBestList,true);
- VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
- IFVERBOSE(2) {
- PrintUserTime("calculated n-best list for (L)MBR decoding");
- }
-
- // lattice MBR
- if (staticData.UseLatticeMBR()) {
- if (m_nbestCollector) {
- //lattice mbr nbest
- vector<LatticeMBRSolution> solutions;
- size_t n = min(nBestSize, staticData.GetNBestSize());
- getLatticeMBRNBest(manager,nBestList,solutions,n);
- ostringstream out;
- OutputLatticeMBRNBest(out, solutions,m_lineNumber);
- m_nbestCollector->Write(m_lineNumber, out.str());
- } else {
- //Lattice MBR decoding
- vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
- OutputBestHypo(mbrBestHypo, m_lineNumber, staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- IFVERBOSE(2) {
- PrintUserTime("finished Lattice MBR decoding");
- }
- }
- }
-
- // consensus decoding
- else if (staticData.UseConsensusDecoding()) {
- const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
- OutputBestHypo(conBestHypo, m_lineNumber,
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, conBestHypo);
- IFVERBOSE(2) {
- PrintUserTime("finished Consensus decoding");
- }
- }
-
- // n-best MBR decoding
- else {
- const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
- OutputBestHypo(mbrBestHypo, m_lineNumber,
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, mbrBestHypo);
- IFVERBOSE(2) {
- PrintUserTime("finished MBR decoding");
- }
- }
- }
-
- // report best translation to output collector
- m_outputCollector->Write(m_lineNumber,out.str(),debug.str());
-
- decisionRuleTime.stop();
- VERBOSE(1, "Line " << m_lineNumber << ": Decision rule took " << decisionRuleTime << " seconds total" << endl);
- }
-
- additionalReportingTime.start();
-
- // output n-best list
- if (m_nbestCollector && !staticData.UseLatticeMBR()) {
- TrellisPathList nBestList;
- ostringstream out;
- manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
- OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
- m_nbestCollector->Write(m_lineNumber, out.str());
- }
-
- //lattice samples
- if (m_latticeSamplesCollector) {
- TrellisPathList latticeSamples;
- ostringstream out;
- manager.CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples);
- OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
- m_latticeSamplesCollector->Write(m_lineNumber, out.str());
- }
-
- // detailed translation reporting
- if (m_detailedTranslationCollector) {
- ostringstream out;
- fix(out,PRECISION);
- TranslationAnalysis::PrintTranslationAnalysis(out, manager.GetBestHypothesis());
- m_detailedTranslationCollector->Write(m_lineNumber,out.str());
- }
-
- //list of unknown words
- if (m_unknownsCollector) {
- const vector<const Phrase*>& unknowns = manager.getSntTranslationOptions()->GetUnknownSources();
- ostringstream out;
- for (size_t i = 0; i < unknowns.size(); ++i) {
- out << *(unknowns[i]);
- }
- out << endl;
- m_unknownsCollector->Write(m_lineNumber, out.str());
- }
-
- // report additional statistics
- manager.CalcDecoderStatistics();
- VERBOSE(1, "Line " << m_lineNumber << ": Additional reporting took " << additionalReportingTime << " seconds total" << endl);
- VERBOSE(1, "Line " << m_lineNumber << ": Translation took " << translationTime << " seconds total" << endl);
- IFVERBOSE(2) {
- PrintUserTime("Sentence Decoding Time:");
- }
- }
-
- ~TranslationTask() {
- delete m_source;
- }
-
-private:
- InputType* m_source;
- size_t m_lineNumber;
- OutputCollector* m_outputCollector;
- OutputCollector* m_nbestCollector;
- OutputCollector* m_latticeSamplesCollector;
- OutputCollector* m_wordGraphCollector;
- OutputCollector* m_searchGraphCollector;
- OutputCollector* m_detailedTranslationCollector;
- OutputCollector* m_alignmentInfoCollector;
- OutputCollector* m_unknownsCollector;
- bool m_outputSearchGraphSLF;
- boost::shared_ptr<HypergraphOutput<Manager> > m_hypergraphOutput;
- std::ofstream *m_alignmentStream;
-
-
-};
-
-static void PrintFeatureWeight(const FeatureFunction* ff)
+namespace Moses
{
- cout << ff->GetScoreProducerDescription() << "=";
- size_t numScoreComps = ff->GetNumScoreComponents();
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- for (size_t i = 0; i < numScoreComps; ++i) {
- cout << " " << values[i];
- }
- cout << endl;
-}
-
-static void ShowWeights()
-{
- //TODO: Find a way of ensuring this order is synced with the nbest
- fix(cout,6);
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
-
- for (size_t i = 0; i < sff.size(); ++i) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- }
- else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- }
- else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
-}
void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
{
@@ -445,8 +90,8 @@ int main(int argc, char** argv)
}
// set number of significant decimals in output
- fix(cout,PRECISION);
- fix(cerr,PRECISION);
+ IOWrapper::FixPrecision(cout);
+ IOWrapper::FixPrecision(cerr);
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
@@ -476,7 +121,7 @@ int main(int argc, char** argv)
srand(time(NULL));
// set up read/writing class
- IOWrapper* ioWrapper = GetIOWrapper(staticData);
+ IOWrapper* ioWrapper = IOWrapper::GetIOWrapper(staticData);
if (!ioWrapper) {
cerr << "Error; Failed to create IO object" << endl;
exit(1);
@@ -489,92 +134,17 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
- boost::shared_ptr<HypergraphOutput<Manager> > hypergraphOutput;
- if (staticData.GetOutputSearchGraphHypergraph()) {
- hypergraphOutput.reset(new HypergraphOutput<Manager>(PRECISION));
- }
-
- // initialize output streams
- // note: we can't just write to STDOUT or files
- // because multithreading may return sentences in shuffled order
- auto_ptr<OutputCollector> outputCollector; // for translations
- auto_ptr<OutputCollector> nbestCollector; // for n-best lists
- auto_ptr<OutputCollector> latticeSamplesCollector; //for lattice samples
- auto_ptr<ofstream> nbestOut;
- auto_ptr<ofstream> latticeSamplesOut;
- size_t nbestSize = staticData.GetNBestSize();
- string nbestFile = staticData.GetNBestFilePath();
- bool output1best = true;
- if (nbestSize) {
- if (nbestFile == "-" || nbestFile == "/dev/stdout") {
- // nbest to stdout, no 1-best
- nbestCollector.reset(new OutputCollector());
- output1best = false;
- } else {
- // nbest to file, 1-best to stdout
- nbestOut.reset(new ofstream(nbestFile.c_str()));
- if (!nbestOut->good()) {
- TRACE_ERR("ERROR: Failed to open " << nbestFile << " for nbest lists" << endl);
- exit(1);
- }
- nbestCollector.reset(new OutputCollector(nbestOut.get()));
- }
- }
- size_t latticeSamplesSize = staticData.GetLatticeSamplesSize();
- string latticeSamplesFile = staticData.GetLatticeSamplesFilePath();
- if (latticeSamplesSize) {
- if (latticeSamplesFile == "-" || latticeSamplesFile == "/dev/stdout") {
- latticeSamplesCollector.reset(new OutputCollector());
- output1best = false;
- } else {
- latticeSamplesOut.reset(new ofstream(latticeSamplesFile.c_str()));
- if (!latticeSamplesOut->good()) {
- TRACE_ERR("ERROR: Failed to open " << latticeSamplesFile << " for lattice samples" << endl);
- exit(1);
- }
- latticeSamplesCollector.reset(new OutputCollector(latticeSamplesOut.get()));
- }
- }
- if (output1best) {
- outputCollector.reset(new OutputCollector());
- }
-
- // initialize stream for word graph (aka: output lattice)
- auto_ptr<OutputCollector> wordGraphCollector;
- if (staticData.GetOutputWordGraph()) {
- wordGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputWordGraphStream())));
- }
-
- // initialize stream for search graph
- // note: this is essentially the same as above, but in a different format
- auto_ptr<OutputCollector> searchGraphCollector;
- if (staticData.GetOutputSearchGraph()) {
- searchGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputSearchGraphStream())));
- }
-
- // initialize stream for details about the decoder run
- auto_ptr<OutputCollector> detailedTranslationCollector;
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- detailedTranslationCollector.reset(new OutputCollector(&(ioWrapper->GetDetailedTranslationReportingStream())));
- }
-
- // initialize stream for word alignment between input and output
- auto_ptr<OutputCollector> alignmentInfoCollector;
- if (!staticData.GetAlignmentOutputFile().empty()) {
- alignmentInfoCollector.reset(new OutputCollector(ioWrapper->GetAlignmentOutputStream()));
- }
+ boost::shared_ptr<HypergraphOutput<Manager> > hypergraphOutput;
+ boost::shared_ptr<HypergraphOutput<ChartManager> > hypergraphOutputChart;
- //initialise stream for unknown (oov) words
- auto_ptr<OutputCollector> unknownsCollector;
- auto_ptr<ofstream> unknownsStream;
- if (!staticData.GetOutputUnknownsFile().empty()) {
- unknownsStream.reset(new ofstream(staticData.GetOutputUnknownsFile().c_str()));
- if (!unknownsStream->good()) {
- TRACE_ERR("Unable to open " << staticData.GetOutputUnknownsFile() << " for unknowns");
- exit(1);
- }
- unknownsCollector.reset(new OutputCollector(unknownsStream.get()));
+ if (staticData.GetOutputSearchGraphHypergraph()) {
+ if (staticData.IsChart()) {
+ hypergraphOutputChart.reset(new HypergraphOutput<ChartManager>(PRECISION));
+ }
+ else {
+ hypergraphOutput.reset(new HypergraphOutput<Manager>(PRECISION));
+ }
}
#ifdef WITH_THREADS
@@ -584,7 +154,8 @@ int main(int argc, char** argv)
// main loop over set of input sentences
InputType* source = NULL;
size_t lineCount = staticData.GetStartTranslationId();
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
+ source->SetTranslationId(lineCount);
IFVERBOSE(1) {
ResetUserTime();
}
@@ -592,19 +163,44 @@ int main(int argc, char** argv)
FeatureFunction::CallChangeSource(source);
// set up task of translating one sentence
- TranslationTask* task =
- new TranslationTask(lineCount,source, outputCollector.get(),
- nbestCollector.get(),
- latticeSamplesCollector.get(),
- wordGraphCollector.get(),
- searchGraphCollector.get(),
- detailedTranslationCollector.get(),
- alignmentInfoCollector.get(),
- unknownsCollector.get(),
- staticData.GetOutputSearchGraphSLF(),
- hypergraphOutput);
+ TranslationTask* task;
+ if (staticData.IsChart()) {
+ // scfg
+ task = new TranslationTask(source, *ioWrapper, hypergraphOutputChart);
+ }
+ else {
+ // pb
+ task = new TranslationTask(source, *ioWrapper,
+ staticData.GetOutputSearchGraphSLF(),
+ hypergraphOutput);
+ }
+
// execute task
#ifdef WITH_THREADS
+#ifdef PT_UG
+ bool spe = params.isParamSpecified("spe-src");
+ if (spe) {
+ // simulated post-editing: always run single-threaded!
+ task->Run();
+ delete task;
+ string src,trg,aln;
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_src,src), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_trg,trg), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_aln,aln), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl())
+ {
+ Mmsapt* sapt = dynamic_cast<Mmsapt*>(pd);
+ if (sapt) sapt->add(src,trg,aln);
+ VERBOSE(1,"[" << HERE << " added src] " << src << endl);
+ VERBOSE(1,"[" << HERE << " added trg] " << trg << endl);
+ VERBOSE(1,"[" << HERE << " added aln] " << aln << endl);
+ }
+ }
+ else
+#endif
pool.Submit(task);
#else
task->Run();
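The PT_UG block above runs the task synchronously, then reads one line from each of three lockstep update streams and feeds the (source, target, alignment) triple to every suffix-array phrase table. A minimal sketch of that per-sentence update step, with an Updatable stand-in for Mmsapt (whose add(src,trg,aln) the hunk calls) and error handling reduced to a bare check:

    #include <istream>
    #include <stdexcept>
    #include <string>

    // Stand-in for an updatable model such as Mmsapt.
    struct Updatable {
      virtual ~Updatable() {}
      virtual void add(const std::string& src, const std::string& trg,
                       const std::string& aln) = 0;
    };

    // One simulated post-editing step: the three streams must stay in
    // lockstep, one line per translated input sentence.
    void UpdateFromSpe(std::istream& spe_src, std::istream& spe_trg,
                       std::istream& spe_aln, Updatable& model) {
      std::string src, trg, aln;
      if (!std::getline(spe_src, src) || !std::getline(spe_trg, trg) ||
          !std::getline(spe_aln, aln))
        throw std::runtime_error("missing update data for simulated post-editing");
      model.add(src, trg, aln);
    }

Running this single-threaded, as the hunk insists, keeps each model update ordered with respect to the sentence that triggered it.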
diff --git a/moses-cmd/Main.h b/moses-cmd/Main.h
index 362c1f245..49fee0219 100644
--- a/moses-cmd/Main.h
+++ b/moses-cmd/Main.h
@@ -1,3 +1,4 @@
+#pragma once
// $Id$
/***********************************************************************
@@ -32,12 +33,10 @@ POSSIBILITY OF SUCH DAMAGE.
// example file on how to use moses library
-#ifndef moses_cmd_Main_h
-#define moses_cmd_Main_h
#include "moses/StaticData.h"
class IOWrapper;
int main(int argc, char* argv[]);
-#endif
+
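The Main.h hunk replaces the classic include guard with #pragma once. For a header like this the two forms are interchangeable; a side-by-side sketch, with the guard macro spelled as in the removed lines:

    // Traditional include guard, as removed above:
    #ifndef moses_cmd_Main_h
    #define moses_cmd_Main_h
    // ... declarations ...
    #endif

    // Same effect, one line; non-standard but supported by all major compilers:
    #pragma once
    // ... declarations ...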
diff --git a/moses-cmd/simulate-pe.cc b/moses-cmd/simulate-pe.cc
index 9678e26c7..f05c0f510 100644
--- a/moses-cmd/simulate-pe.cc
+++ b/moses-cmd/simulate-pe.cc
@@ -20,44 +20,41 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/iostreams/device/file.hpp>
-#include <boost/iostreams/filter/bzip2.hpp>
-#include <boost/iostreams/filter/gzip.hpp>
-#include <boost/iostreams/filtering_stream.hpp>
-#include <boost/foreach.hpp>
-
+/**
+ * Moses main, for single-threaded and multi-threaded operation.
+ **/
#include <exception>
#include <fstream>
#include <sstream>
#include <vector>
+#include <boost/program_options.hpp>
#include "util/usage.hh"
#include "util/exception.hh"
#include "moses/Util.h"
+
+#ifdef PT_UG
#include "moses/TranslationModel/UG/mmsapt.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
+#endif
#ifdef WIN32
// Include Visual Leak Detector
//#include <vld.h>
#endif
-#include "moses/TranslationAnalysis.h"
-#include "IOWrapper.h"
-#include "mbr.h"
-
+#include "moses/IOWrapper.h"
#include "moses/Hypothesis.h"
+#include "moses/HypergraphOutput.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
+#include "moses/TypeDef.h"
#include "moses/Util.h"
#include "moses/Timer.h"
-#include "moses/ThreadPool.h"
-#include "moses/OutputCollector.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/TranslationTask.h"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
@@ -65,522 +62,15 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
using namespace Moses;
-using namespace MosesCmd;
-
-namespace MosesCmd
-{
-// output floats with five significant digits
-static const size_t PRECISION = 3;
-
-/** Enforce rounding */
-void fix(std::ostream& stream, size_t size)
-{
- stream.setf(std::ios::fixed);
- stream.precision(size);
-}
-/** Translates a sentence.
- * - calls the search (Manager)
- * - applies the decision rule
- * - outputs best translation and additional reporting
- **/
-class TranslationTask : public Task
+namespace Moses
{
-public:
-
- TranslationTask(size_t lineNumber,
- InputType* source,
- OutputCollector* outputCollector,
- OutputCollector* nbestCollector,
- OutputCollector* latticeSamplesCollector,
- OutputCollector* wordGraphCollector,
- OutputCollector* searchGraphCollector,
- OutputCollector* detailedTranslationCollector,
- OutputCollector* alignmentInfoCollector,
- OutputCollector* unknownsCollector,
- bool outputSearchGraphSLF,
- bool outputSearchGraphHypergraph)
- : m_source(source)
- , m_lineNumber(lineNumber)
- , m_outputCollector(outputCollector)
- , m_nbestCollector(nbestCollector)
- , m_latticeSamplesCollector(latticeSamplesCollector)
- , m_wordGraphCollector(wordGraphCollector)
- , m_searchGraphCollector(searchGraphCollector)
- , m_detailedTranslationCollector(detailedTranslationCollector)
- , m_alignmentInfoCollector(alignmentInfoCollector)
- , m_unknownsCollector(unknownsCollector)
- , m_outputSearchGraphSLF(outputSearchGraphSLF)
- , m_outputSearchGraphHypergraph(outputSearchGraphHypergraph)
- { }
-
- /** Translate one sentence
- * gets called by main function implemented at end of this source file */
- void Run() {
- // shorthand for "global data"
- const StaticData &staticData = StaticData::Instance();
-
- // input sentence
- Sentence sentence;
-
- // report wall time spent on translation
- Timer translationTime;
- translationTime.start();
-
- // report thread number
-#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
- TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << pthread_self() << std::endl);
-#endif
-
-
- // execute the translation
- // note: this executes the search, resulting in a search graph
- // we still need to apply the decision rule (MAP, MBR, ...)
- Timer initTime;
- initTime.start();
- Manager manager(m_lineNumber, *m_source,staticData.GetSearchAlgorithm());
- VERBOSE(1, "Line " << m_lineNumber << ": Initialize search took " << initTime << " seconds total" << endl);
- manager.ProcessSentence();
-
- // we are done with search, let's look what we got
- Timer additionalReportingTime;
- additionalReportingTime.start();
-
- // output word graph
- if (m_wordGraphCollector) {
- ostringstream out;
- fix(out,PRECISION);
- manager.GetWordGraph(m_lineNumber, out);
- m_wordGraphCollector->Write(m_lineNumber, out.str());
- }
-
- // output search graph
- if (m_searchGraphCollector) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraph(m_lineNumber, out);
- m_searchGraphCollector->Write(m_lineNumber, out.str());
-
-#ifdef HAVE_PROTOBUF
- if (staticData.GetOutputSearchGraphPB()) {
- ostringstream sfn;
- sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << m_lineNumber << ".pb" << ends;
- string fn = sfn.str();
- VERBOSE(2, "Writing search graph to " << fn << endl);
- fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out);
- manager.SerializeSearchGraphPB(m_lineNumber, output);
- }
-#endif
- }
-
- // Output search graph in HTK standard lattice format (SLF)
- if (m_outputSearchGraphSLF) {
- stringstream fileName;
- fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
- std::ofstream *file = new std::ofstream;
- file->open(fileName.str().c_str());
- if (file->is_open() && file->good()) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraphAsSLF(m_lineNumber, out);
- *file << out.str();
- file -> flush();
- } else {
- TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
- }
- delete file;
- }
-
- // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
- if (m_outputSearchGraphHypergraph) {
-
- vector<string> hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph");
-
- bool appendSuffix;
- if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") {
- appendSuffix = true;
- } else {
- appendSuffix = false;
- }
-
- string compression;
- if (hypergraphParameters.size() > 1) {
- compression = hypergraphParameters[1];
- } else {
- compression = "txt";
- }
-
- string hypergraphDir;
- if ( hypergraphParameters.size() > 2 ) {
- hypergraphDir = hypergraphParameters[2];
- } else {
- string nbestFile = staticData.GetNBestFilePath();
- if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
- boost::filesystem::path nbestPath(nbestFile);
-
- // In the Boost filesystem API version 2,
- // which was the default prior to Boost 1.46,
- // the filename() method returned a string.
- //
- // In the Boost filesystem API version 3,
- // which is the default starting with Boost 1.46,
- // the filename() method returns a path object.
- //
- // To get a string from the path object,
- // the native() method must be called.
- // hypergraphDir = nbestPath.parent_path().filename()
- //#if BOOST_VERSION >= 104600
- // .native()
- //#endif
- //;
-
- // Hopefully the following compiles under all versions of Boost.
- //
- // If this line gives you compile errors,
- // contact Lane Schwartz on the Moses mailing list
- hypergraphDir = nbestPath.parent_path().string();
-
- } else {
- stringstream hypergraphDirName;
- hypergraphDirName << boost::filesystem::current_path().string() << "/hypergraph";
- hypergraphDir = hypergraphDirName.str();
- }
- }
-
- if ( ! boost::filesystem::exists(hypergraphDir) ) {
- boost::filesystem::create_directory(hypergraphDir);
- }
-
- if ( ! boost::filesystem::exists(hypergraphDir) ) {
- TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl);
- } else if ( ! boost::filesystem::is_directory(hypergraphDir) ) {
- TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
- } else {
- stringstream fileName;
- fileName << hypergraphDir << "/" << m_lineNumber;
- if ( appendSuffix ) {
- fileName << "." << compression;
- }
- boost::iostreams::filtering_ostream *file
- = new boost::iostreams::filtering_ostream;
-
- if ( compression == "gz" ) {
- file->push( boost::iostreams::gzip_compressor() );
- } else if ( compression == "bz2" ) {
- file->push( boost::iostreams::bzip2_compressor() );
- } else if ( compression != "txt" ) {
- TRACE_ERR("Unrecognized hypergraph compression format ("
- << compression
- << ") - using uncompressed plain txt" << std::endl);
- compression = "txt";
- }
-
- file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
-
- if (file->is_complete() && file->good()) {
- fix(*file,PRECISION);
- manager.OutputSearchGraphAsHypergraph(*file);
- file -> flush();
- } else {
- TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber
- << " because the output file " << fileName.str()
- << " is not open or not ready for writing"
- << std::endl);
- }
- file -> pop();
- delete file;
- }
- }
- additionalReportingTime.stop();
-
- // apply decision rule and output best translation(s)
- if (m_outputCollector) {
- ostringstream out;
- ostringstream debug;
- fix(debug,PRECISION);
-
- // all derivations - send them to debug stream
- if (staticData.PrintAllDerivations()) {
- additionalReportingTime.start();
- manager.PrintAllDerivations(m_lineNumber, debug);
- additionalReportingTime.stop();
- }
-
- Timer decisionRuleTime;
- decisionRuleTime.start();
-
- // MAP decoding: best hypothesis
- const Hypothesis* bestHypo = NULL;
- if (!staticData.UseMBR()) {
- bestHypo = manager.GetBestHypothesis();
- if (bestHypo) {
- if (StaticData::Instance().GetOutputHypoScore()) {
- out << bestHypo->GetTotalScore() << ' ';
- }
- if (staticData.IsPathRecoveryEnabled()) {
- OutputInput(out, bestHypo);
- out << "||| ";
- }
- if (staticData.GetParam("print-id").size() && Scan<bool>(staticData.GetParam("print-id")[0]) ) {
- out << m_source->GetTranslationId() << " ";
- }
-
- if (staticData.GetReportSegmentation() == 2) {
- manager.GetOutputLanguageModelOrder(out, bestHypo);
- }
- OutputBestSurface(
- out,
- bestHypo,
- staticData.GetOutputFactorOrder(),
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors());
- if (staticData.PrintAlignmentInfo()) {
- out << "||| ";
- OutputAlignment(out, bestHypo);
- }
-
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
- IFVERBOSE(1) {
- debug << "BEST TRANSLATION: " << *bestHypo << endl;
- }
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- }
-
- out << endl;
- }
-
- // MBR decoding (n-best MBR, lattice MBR, consensus)
- else {
- // we first need the n-best translations
- size_t nBestSize = staticData.GetMBRSize();
- if (nBestSize <= 0) {
- cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl;
- exit(1);
- }
- TrellisPathList nBestList;
- manager.CalcNBest(nBestSize, nBestList,true);
- VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
- IFVERBOSE(2) {
- PrintUserTime("calculated n-best list for (L)MBR decoding");
- }
-
- // lattice MBR
- if (staticData.UseLatticeMBR()) {
- if (m_nbestCollector) {
- //lattice mbr nbest
- vector<LatticeMBRSolution> solutions;
- size_t n = min(nBestSize, staticData.GetNBestSize());
- getLatticeMBRNBest(manager,nBestList,solutions,n);
- ostringstream out;
- OutputLatticeMBRNBest(out, solutions,m_lineNumber);
- m_nbestCollector->Write(m_lineNumber, out.str());
- } else {
- //Lattice MBR decoding
- vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
- OutputBestHypo(mbrBestHypo, m_lineNumber, staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- IFVERBOSE(2) {
- PrintUserTime("finished Lattice MBR decoding");
- }
- }
- }
-
- // consensus decoding
- else if (staticData.UseConsensusDecoding()) {
- const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
- OutputBestHypo(conBestHypo, m_lineNumber,
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, conBestHypo);
- IFVERBOSE(2) {
- PrintUserTime("finished Consensus decoding");
- }
- }
-
- // n-best MBR decoding
- else {
- const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
- OutputBestHypo(mbrBestHypo, m_lineNumber,
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, mbrBestHypo);
- IFVERBOSE(2) {
- PrintUserTime("finished MBR decoding");
- }
- }
- }
-
- // report best translation to output collector
- m_outputCollector->Write(m_lineNumber,out.str(),debug.str());
-
- decisionRuleTime.stop();
- VERBOSE(1, "Line " << m_lineNumber << ": Decision rule took " << decisionRuleTime << " seconds total" << endl);
- }
-
- additionalReportingTime.start();
-
- // output n-best list
- if (m_nbestCollector && !staticData.UseLatticeMBR()) {
- TrellisPathList nBestList;
- ostringstream out;
- manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
- OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
- m_nbestCollector->Write(m_lineNumber, out.str());
- }
-
- //lattice samples
- if (m_latticeSamplesCollector) {
- TrellisPathList latticeSamples;
- ostringstream out;
- manager.CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples);
- OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
- m_latticeSamplesCollector->Write(m_lineNumber, out.str());
- }
-
- // detailed translation reporting
- if (m_detailedTranslationCollector) {
- ostringstream out;
- fix(out,PRECISION);
- TranslationAnalysis::PrintTranslationAnalysis(out, manager.GetBestHypothesis());
- m_detailedTranslationCollector->Write(m_lineNumber,out.str());
- }
-
- //list of unknown words
- if (m_unknownsCollector) {
- const vector<const Phrase*>& unknowns = manager.getSntTranslationOptions()->GetUnknownSources();
- ostringstream out;
- for (size_t i = 0; i < unknowns.size(); ++i) {
- out << *(unknowns[i]);
- }
- out << endl;
- m_unknownsCollector->Write(m_lineNumber, out.str());
- }
-
- // report additional statistics
- manager.CalcDecoderStatistics();
- VERBOSE(1, "Line " << m_lineNumber << ": Additional reporting took " << additionalReportingTime << " seconds total" << endl);
- VERBOSE(1, "Line " << m_lineNumber << ": Translation took " << translationTime << " seconds total" << endl);
- IFVERBOSE(2) {
- PrintUserTime("Sentence Decoding Time:");
- }
- }
-
- ~TranslationTask() {
- delete m_source;
- }
-
-private:
- InputType* m_source;
- size_t m_lineNumber;
- OutputCollector* m_outputCollector;
- OutputCollector* m_nbestCollector;
- OutputCollector* m_latticeSamplesCollector;
- OutputCollector* m_wordGraphCollector;
- OutputCollector* m_searchGraphCollector;
- OutputCollector* m_detailedTranslationCollector;
- OutputCollector* m_alignmentInfoCollector;
- OutputCollector* m_unknownsCollector;
- bool m_outputSearchGraphSLF;
- bool m_outputSearchGraphHypergraph;
- std::ofstream *m_alignmentStream;
-
-
-};
-
-static void PrintFeatureWeight(const FeatureFunction* ff)
-{
- cout << ff->GetScoreProducerDescription() << "=";
- size_t numScoreComps = ff->GetNumScoreComponents();
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- for (size_t i = 0; i < numScoreComps; ++i) {
- cout << " " << values[i];
- }
- cout << endl;
-}
-
-static void ShowWeights()
-{
- //TODO: Find a way of ensuring this order is synced with the nbest
- fix(cout,6);
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
-
- for (size_t i = 0; i < sff.size(); ++i) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- }
- else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- }
- else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
-}
-
-size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
-{
- size_t numScoreComps = ff->GetNumScoreComponents();
- if (numScoreComps != 0) {
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- if (numScoreComps > 1) {
- for (size_t i = 0; i < numScoreComps; ++i) {
- outputSearchGraphStream << ff->GetScoreProducerDescription()
- << i
- << "=" << values[i] << endl;
- }
- } else {
- outputSearchGraphStream << ff->GetScoreProducerDescription()
- << "=" << values[0] << endl;
- }
- return index+numScoreComps;
- } else {
- UTIL_THROW2("Sparse features are not yet supported when outputting hypergraph format");
- }
-}
-
void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
-
- const vector<const StatelessFeatureFunction*>& slf =StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- size_t featureIndex = 1;
- for (size_t i = 0; i < sff.size(); ++i) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- /*
- if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
- slf[i]->GetScoreProducerWeightShortName() != "tm" &&
- slf[i]->GetScoreProducerWeightShortName() != "I" &&
- slf[i]->GetScoreProducerWeightShortName() != "g")
- */
- {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
- }
- }
- const vector<PhraseDictionary*>& pds = PhraseDictionary::GetColl();
- for( size_t i=0; i<pds.size(); i++ ) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
- }
- const vector<GenerationDictionary*>& gds = GenerationDictionary::GetColl();
- for( size_t i=0; i<gds.size(); i++ ) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
- }
-
+ StaticData::Instance().GetAllWeights().Save(outputSearchGraphStream);
}
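The rewritten OutputFeatureWeightsForHypergraph delegates to ScoreComponentCollection::Save rather than walking every feature function by hand. As a rough analog of what such a dump amounts to (not the Moses API: a plain map stands in for the weight vector, and the authoritative line format is whatever Save writes):

    #include <iostream>
    #include <map>
    #include <string>

    // Rough analog: dense feature weights as "FeatureName= value" lines,
    // fixed six-digit precision as set up in the function above.
    void SaveWeights(const std::map<std::string, float>& weights,
                     std::ostream& out) {
      out.setf(std::ios::fixed);
      out.precision(6);
      std::map<std::string, float>::const_iterator it;
      for (it = weights.begin(); it != weights.end(); ++it)
        out << it->first << "= " << it->second << '\n';
    }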
@@ -603,9 +93,36 @@ int main(int argc, char** argv)
}
// set number of significant decimals in output
- fix(cout,PRECISION);
- fix(cerr,PRECISION);
-
+  // By the way, having a static member function in a Moses-specific class
+ // just to do
+ // cout.setf(ios::fixed);cout.precision(3);
+ // doesn't make sense.
+ // 1. It makes the program harder to understand. If I see
+ // cout.setf(ios::fixed);cout.precision(3);
+ // I know immediately what's going on. If I see,
+ // IOWrapper::FixPrecision(cout);
+ // I assume that something much more elaborate is going on (side effects?)
+ // and need to look it up.
+ // After all, one reads programs in order to understand them.
+ // 2. If you want to enforce a certain default behaviour on stdout,
+ // that should happen only once. Other components should not do that.
+ // 3. If you want to format locally, use
+ // string IOWrapper::score_default_formatting = "%.3f";
+ // cout << boost::format(IOWrapper::score_default_formatting) % my_variable;
+ // This even allows you to do pretty printing by default if you wish.
+ // (Actually, the format variable should be constructed
+ // from an int IOWrapper::score_output_default_precision itself.)
+ // 4. If output speed is an issue and you are afraid of the overhead that
+ // boost::format may add, don't use streams to begin with.
+ // IOWrapper::FixPrecision(cout);
+ // IOWrapper::FixPrecision(cerr);
+ cout.setf(ios::fixed); cout.precision(3);
+  cerr.setf(ios::fixed); cerr.precision(3);
+
+#ifdef PT_UG
+  // see my notes on the moses support mailing list; this is a hack around
+ // the misguided implementation of LoadParameter(), which should leave
+ // arguments it doesn't know alone.
vector<pair<string,int> > argfilter(4);
argfilter[0] = std::make_pair(string("--spe-src"),1);
argfilter[1] = std::make_pair(string("--spe-trg"),1);
@@ -616,16 +133,16 @@ int main(int argc, char** argv)
filter_arguments(argc, argv, mo_acnt, &mo_args, my_acnt, &my_args, argfilter);
ifstream spe_src,spe_trg,spe_aln;
- // instead of translating show coverage by phrase tables
for (int i = 0; i < my_acnt; i += 2)
{
- if (!strcmp(my_args[i],"--spe-src"))
- spe_src.open(my_args[i+1]);
- else if (!strcmp(my_args[i],"--spe-trg"))
- spe_trg.open(my_args[i+1]);
- else if (!strcmp(my_args[i],"--spe-aln"))
- spe_aln.open(my_args[i+1]);
+ if (!strcmp(my_args[i],"--spe-src"))
+ spe_src.open(my_args[i+1]);
+ else if (!strcmp(my_args[i],"--spe-trg"))
+ spe_trg.open(my_args[i+1]);
+ else if (!strcmp(my_args[i],"--spe-aln"))
+ spe_aln.open(my_args[i+1]);
}
+#endif
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
@@ -634,7 +151,6 @@ int main(int argc, char** argv)
exit(1);
}
-
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(&params, argv[0])) {
@@ -650,12 +166,11 @@ int main(int argc, char** argv)
// shorthand for accessing information in StaticData
const StaticData& staticData = StaticData::Instance();
-
//initialise random numbers
srand(time(NULL));
// set up read/writing class
- IOWrapper* ioWrapper = GetIOWrapper(staticData);
+ IOWrapper* ioWrapper = IOWrapper::GetIOWrapper(staticData);
if (!ioWrapper) {
cerr << "Error; Failed to create IO object" << endl;
exit(1);
@@ -668,113 +183,17 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
- if (staticData.GetOutputSearchGraphHypergraph()) {
- ofstream* weightsOut = new std::ofstream;
- stringstream weightsFilename;
- if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
- weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3];
- } else {
- string nbestFile = staticData.GetNBestFilePath();
- if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
- boost::filesystem::path nbestPath(nbestFile);
- weightsFilename << nbestPath.parent_path().filename() << "/weights";
- } else {
- weightsFilename << boost::filesystem::current_path().string() << "/hypergraph/weights";
- }
- }
- boost::filesystem::path weightsFilePath(weightsFilename.str());
- if ( ! boost::filesystem::exists(weightsFilePath.parent_path()) ) {
- boost::filesystem::create_directory(weightsFilePath.parent_path());
- }
- TRACE_ERR("The weights file is " << weightsFilename.str() << "\n");
- weightsOut->open(weightsFilename.str().c_str());
- OutputFeatureWeightsForHypergraph(*weightsOut);
- weightsOut->flush();
- weightsOut->close();
- delete weightsOut;
- }
-
-
- // initialize output streams
- // note: we can't just write to STDOUT or files
- // because multithreading may return sentences in shuffled order
- auto_ptr<OutputCollector> outputCollector; // for translations
- auto_ptr<OutputCollector> nbestCollector; // for n-best lists
- auto_ptr<OutputCollector> latticeSamplesCollector; //for lattice samples
- auto_ptr<ofstream> nbestOut;
- auto_ptr<ofstream> latticeSamplesOut;
- size_t nbestSize = staticData.GetNBestSize();
- string nbestFile = staticData.GetNBestFilePath();
- bool output1best = true;
- if (nbestSize) {
- if (nbestFile == "-" || nbestFile == "/dev/stdout") {
- // nbest to stdout, no 1-best
- nbestCollector.reset(new OutputCollector());
- output1best = false;
- } else {
- // nbest to file, 1-best to stdout
- nbestOut.reset(new ofstream(nbestFile.c_str()));
- if (!nbestOut->good()) {
- TRACE_ERR("ERROR: Failed to open " << nbestFile << " for nbest lists" << endl);
- exit(1);
- }
- nbestCollector.reset(new OutputCollector(nbestOut.get()));
- }
- }
- size_t latticeSamplesSize = staticData.GetLatticeSamplesSize();
- string latticeSamplesFile = staticData.GetLatticeSamplesFilePath();
- if (latticeSamplesSize) {
- if (latticeSamplesFile == "-" || latticeSamplesFile == "/dev/stdout") {
- latticeSamplesCollector.reset(new OutputCollector());
- output1best = false;
- } else {
- latticeSamplesOut.reset(new ofstream(latticeSamplesFile.c_str()));
- if (!latticeSamplesOut->good()) {
- TRACE_ERR("ERROR: Failed to open " << latticeSamplesFile << " for lattice samples" << endl);
- exit(1);
- }
- latticeSamplesCollector.reset(new OutputCollector(latticeSamplesOut.get()));
- }
- }
- if (output1best) {
- outputCollector.reset(new OutputCollector());
- }
-
- // initialize stream for word graph (aka: output lattice)
- auto_ptr<OutputCollector> wordGraphCollector;
- if (staticData.GetOutputWordGraph()) {
- wordGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputWordGraphStream())));
- }
-
- // initialize stream for search graph
- // note: this is essentially the same as above, but in a different format
- auto_ptr<OutputCollector> searchGraphCollector;
- if (staticData.GetOutputSearchGraph()) {
- searchGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputSearchGraphStream())));
- }
-
- // initialize stream for details about the decoder run
- auto_ptr<OutputCollector> detailedTranslationCollector;
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- detailedTranslationCollector.reset(new OutputCollector(&(ioWrapper->GetDetailedTranslationReportingStream())));
- }
- // initialize stream for word alignment between input and output
- auto_ptr<OutputCollector> alignmentInfoCollector;
- if (!staticData.GetAlignmentOutputFile().empty()) {
- alignmentInfoCollector.reset(new OutputCollector(ioWrapper->GetAlignmentOutputStream()));
- }
+ boost::shared_ptr<HypergraphOutput<Manager> > hypergraphOutput;
+ boost::shared_ptr<HypergraphOutput<ChartManager> > hypergraphOutputChart;
- //initialise stream for unknown (oov) words
- auto_ptr<OutputCollector> unknownsCollector;
- auto_ptr<ofstream> unknownsStream;
- if (!staticData.GetOutputUnknownsFile().empty()) {
- unknownsStream.reset(new ofstream(staticData.GetOutputUnknownsFile().c_str()));
- if (!unknownsStream->good()) {
- TRACE_ERR("Unable to open " << staticData.GetOutputUnknownsFile() << " for unknowns");
- exit(1);
- }
- unknownsCollector.reset(new OutputCollector(unknownsStream.get()));
+ if (staticData.GetOutputSearchGraphHypergraph()) {
+ if (staticData.IsChart()) {
+ hypergraphOutputChart.reset(new HypergraphOutput<ChartManager>(PRECISION));
+ }
+ else {
+ hypergraphOutput.reset(new HypergraphOutput<Manager>(PRECISION));
+ }
}
#ifdef WITH_THREADS
@@ -784,45 +203,54 @@ int main(int argc, char** argv)
// main loop over set of input sentences
InputType* source = NULL;
size_t lineCount = staticData.GetStartTranslationId();
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
+ source->SetTranslationId(lineCount);
IFVERBOSE(1) {
ResetUserTime();
}
+
+ FeatureFunction::CallChangeSource(source);
+
// set up task of translating one sentence
- TranslationTask* task =
- new TranslationTask(lineCount,source, outputCollector.get(),
- nbestCollector.get(),
- latticeSamplesCollector.get(),
- wordGraphCollector.get(),
- searchGraphCollector.get(),
- detailedTranslationCollector.get(),
- alignmentInfoCollector.get(),
- unknownsCollector.get(),
- staticData.GetOutputSearchGraphSLF(),
- staticData.GetOutputSearchGraphHypergraph());
+ TranslationTask* task;
+ if (staticData.IsChart()) {
+ // scfg
+ task = new TranslationTask(source, *ioWrapper, hypergraphOutputChart);
+ }
+ else {
+ // pb
+ task = new TranslationTask(source, *ioWrapper,
+ staticData.GetOutputSearchGraphSLF(),
+ hypergraphOutput);
+ }
+
// execute task
#ifdef WITH_THREADS
+#ifdef PT_UG
if (my_acnt)
- {
- task->Run();
- delete task;
- string src,trg,aln;
- UTIL_THROW_IF2(!getline(spe_src,src), "[" << HERE << "] "
- << "missing update data for simulated post-editing.");
- UTIL_THROW_IF2(!getline(spe_trg,trg), "[" << HERE << "] "
- << "missing update data for simulated post-editing.");
- UTIL_THROW_IF2(!getline(spe_aln,aln), "[" << HERE << "] "
- << "missing update data for simulated post-editing.");
- BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl())
- {
- Mmsapt* sapt = dynamic_cast<Mmsapt*>(pd);
- if (sapt) sapt->add(src,trg,aln);
- VERBOSE(1,"[" << HERE << " added src] " << src << endl);
- VERBOSE(1,"[" << HERE << " added trg] " << trg << endl);
- VERBOSE(1,"[" << HERE << " added aln] " << aln << endl);
- }
- }
- else pool.Submit(task);
+ {
+ // simulated post-editing: always run single-threaded!
+ task->Run();
+ delete task;
+ string src,trg,aln;
+ UTIL_THROW_IF2(!getline(spe_src,src), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(spe_trg,trg), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(spe_aln,aln), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl())
+ {
+ Mmsapt* sapt = dynamic_cast<Mmsapt*>(pd);
+ if (sapt) sapt->add(src,trg,aln);
+ VERBOSE(1,"[" << HERE << " added src] " << src << endl);
+ VERBOSE(1,"[" << HERE << " added trg] " << trg << endl);
+ VERBOSE(1,"[" << HERE << " added aln] " << aln << endl);
+ }
+ }
+ else
+#endif
+ pool.Submit(task);
#else
task->Run();
delete task;
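The BOOST_FOREACH loop in the simulated post-editing branch uses a common idiom: probe a heterogeneous collection for an optional capability with dynamic_cast, so only models that can absorb updates receive them. A self-contained sketch, with Table/UpdatableTable as stand-ins for PhraseDictionary/Mmsapt:

    #include <cstddef>
    #include <vector>

    struct Table { virtual ~Table() {} };

    struct UpdatableTable : Table {
      int last_update;
      void add(int update) { last_update = update; }
    };

    // Only tables that actually support updates receive the broadcast.
    void BroadcastUpdate(const std::vector<Table*>& tables, int update) {
      for (std::size_t i = 0; i < tables.size(); ++i)
        if (UpdatableTable* u = dynamic_cast<UpdatableTable*>(tables[i]))
          u->add(update);
    }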
diff --git a/moses/AlignmentInfo.cpp b/moses/AlignmentInfo.cpp
index ed317a764..b059a9ffd 100644
--- a/moses/AlignmentInfo.cpp
+++ b/moses/AlignmentInfo.cpp
@@ -25,13 +25,22 @@
namespace Moses
{
+
AlignmentInfo::AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs)
: m_collection(pairs)
{
- BuildNonTermIndexMap();
+ BuildNonTermIndexMaps();
+}
+
+AlignmentInfo::AlignmentInfo(const std::vector<unsigned char> &aln)
+{
+ assert(aln.size()%2==0);
+ for (size_t i = 0; i < aln.size(); i+= 2)
+ m_collection.insert(std::make_pair(size_t(aln[i]),size_t(aln[i+1])));
+ BuildNonTermIndexMaps();
}
-void AlignmentInfo::BuildNonTermIndexMap()
+void AlignmentInfo::BuildNonTermIndexMaps()
{
if (m_collection.empty()) {
return;
@@ -44,14 +53,17 @@ void AlignmentInfo::BuildNonTermIndexMap()
}
}
m_nonTermIndexMap.resize(maxIndex+1, NOT_FOUND);
+ m_nonTermIndexMap2.resize(maxIndex+1, NOT_FOUND);
size_t i = 0;
for (p = begin(); p != end(); ++p) {
if (m_nonTermIndexMap[p->second] != NOT_FOUND) {
// 1-to-many. Definitely a set of terminals. Don't bother storing 1-to-1 index map
m_nonTermIndexMap.clear();
+ m_nonTermIndexMap2.clear();
return;
}
m_nonTermIndexMap[p->second] = i++;
+ m_nonTermIndexMap2[p->second] = p->first;
}
}
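The new AlignmentInfo constructor takes alignments as a flat byte vector with (source, target) pairs packed consecutively; the assert enforces an even length. A standalone sketch of the same unpacking:

    #include <cassert>
    #include <cstddef>
    #include <set>
    #include <utility>
    #include <vector>

    // Unpack a flat {s0,t0, s1,t1, ...} byte vector into alignment pairs,
    // mirroring AlignmentInfo(const std::vector<unsigned char>&) above.
    std::set<std::pair<std::size_t, std::size_t> >
    UnpackAlignment(const std::vector<unsigned char>& aln) {
      assert(aln.size() % 2 == 0);
      std::set<std::pair<std::size_t, std::size_t> > pairs;
      for (std::size_t i = 0; i < aln.size(); i += 2)
        pairs.insert(std::make_pair(std::size_t(aln[i]),
                                    std::size_t(aln[i + 1])));
      return pairs;
    }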
diff --git a/moses/AlignmentInfo.h b/moses/AlignmentInfo.h
index 4e9647e3a..895dde8a1 100644
--- a/moses/AlignmentInfo.h
+++ b/moses/AlignmentInfo.h
@@ -65,6 +65,12 @@ public:
return m_nonTermIndexMap;
}
+ /** Like GetNonTermIndexMap but the return value is the symbol index (i.e.
+ * the index counting both terminals and non-terminals) */
+ const NonTermIndexMap &GetNonTermIndexMap2() const {
+ return m_nonTermIndexMap2;
+ }
+
const CollType &GetAlignments() const {
return m_collection;
}
@@ -88,11 +94,12 @@ public:
private:
//! AlignmentInfo objects should only be created by an AlignmentInfoCollection
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs);
-
- void BuildNonTermIndexMap();
+ explicit AlignmentInfo(const std::vector<unsigned char> &aln);
+ void BuildNonTermIndexMaps();
CollType m_collection;
NonTermIndexMap m_nonTermIndexMap;
+ NonTermIndexMap m_nonTermIndexMap2;
};
/** Define an arbitrary strict weak ordering between AlignmentInfo objects
diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp
index ef6e62eb3..0a54226cd 100644
--- a/moses/AlignmentInfoCollection.cpp
+++ b/moses/AlignmentInfoCollection.cpp
@@ -38,23 +38,23 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
return *m_emptyAlignmentInfo;
}
-const AlignmentInfo *AlignmentInfoCollection::Add(
- const std::set<std::pair<size_t,size_t> > &pairs)
+AlignmentInfo const *
+AlignmentInfoCollection::
+Add(AlignmentInfo const& ainfo)
{
- AlignmentInfo pairsAlignmentInfo(pairs);
#ifdef WITH_THREADS
{
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
- AlignmentInfoSet::const_iterator i = m_collection.find(pairsAlignmentInfo);
+ AlignmentInfoSet::const_iterator i = m_collection.find(ainfo);
if (i != m_collection.end())
return &*i;
}
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
- std::pair<AlignmentInfoSet::iterator, bool> ret =
- m_collection.insert(pairsAlignmentInfo);
+ std::pair<AlignmentInfoSet::iterator, bool> ret = m_collection.insert(ainfo);
return &(*ret.first);
}
+
}
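The reworked Add keeps the collection's read-mostly locking discipline: look the element up under a shared lock and, only on a miss, take the exclusive lock and insert. The gap between the two locks is harmless because set::insert of a value another thread interned in the meantime simply finds the existing element. A self-contained sketch of the idiom as a generic intern pool, using boost::shared_mutex as in the hunk:

    #include <set>
    #include <boost/thread/locks.hpp>
    #include <boost/thread/shared_mutex.hpp>

    // Intern values in a set; returned pointers stay valid because
    // std::set is node-based. Shared lock for lookups, exclusive for inserts.
    template <typename T>
    class InternPool {
    public:
      const T* Add(const T& value) {
        {
          boost::shared_lock<boost::shared_mutex> read_lock(m_lock);
          typename std::set<T>::const_iterator i = m_pool.find(value);
          if (i != m_pool.end()) return &*i;
        }
        boost::unique_lock<boost::shared_mutex> write_lock(m_lock);
        return &*m_pool.insert(value).first;  // no-op if a writer beat us
      }
    private:
      std::set<T> m_pool;
      boost::shared_mutex m_lock;
    };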
diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h
index 37d717b0f..1db0a2268 100644
--- a/moses/AlignmentInfoCollection.h
+++ b/moses/AlignmentInfoCollection.h
@@ -46,7 +46,16 @@ public:
* contains such an object then returns a pointer to it; otherwise a new
* one is inserted.
*/
- const AlignmentInfo *Add(const std::set<std::pair<size_t,size_t> > &);
+ private:
+ const AlignmentInfo* Add(AlignmentInfo const& ainfo);
+
+ public:
+ template<typename ALNREP>
+ AlignmentInfo const *
+ Add(ALNREP const & aln)
+ {
+ return this->Add(AlignmentInfo(aln));
+ }
//! Returns a pointer to an empty AlignmentInfo object.
const AlignmentInfo &GetEmptyAlignmentInfo() const;
@@ -54,6 +63,7 @@ public:
private:
typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
+
//! Only a single static variable should be created.
AlignmentInfoCollection();
~AlignmentInfoCollection();
diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp
index c7abc697d..56bc8529d 100644
--- a/moses/ChartManager.cpp
+++ b/moses/ChartManager.cpp
@@ -42,12 +42,11 @@ extern bool g_mosesDebug;
* \param source the sentence to be decoded
* \param system which particular set of models to use.
*/
-ChartManager::ChartManager(size_t lineNumber,InputType const& source)
+ChartManager::ChartManager(InputType const& source)
:m_source(source)
,m_hypoStackColl(source, *this)
,m_start(clock())
,m_hypothesisId(0)
- ,m_lineNumber(lineNumber)
,m_parser(source, m_hypoStackColl)
,m_translationOptionList(StaticData::Instance().GetRuleLimit(), source)
{
@@ -294,7 +293,7 @@ void ChartManager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraph
}
void ChartManager::OutputSearchGraphMoses(std::ostream &outputSearchGraphStream) const {
- ChartSearchGraphWriterMoses writer(&outputSearchGraphStream, m_lineNumber);
+ ChartSearchGraphWriterMoses writer(&outputSearchGraphStream, m_source.GetTranslationId());
WriteSearchGraph(writer);
}
diff --git a/moses/ChartManager.h b/moses/ChartManager.h
index b39b078bd..9ad4f4b85 100644
--- a/moses/ChartManager.h
+++ b/moses/ChartManager.h
@@ -50,7 +50,6 @@ private:
std::auto_ptr<SentenceStats> m_sentenceStats;
clock_t m_start; /**< starting time, used for logging */
unsigned m_hypothesisId; /* For handing out hypothesis ids to ChartHypothesis */
- size_t m_lineNumber;
ChartParser m_parser;
@@ -62,7 +61,7 @@ private:
void WriteSearchGraph(const ChartSearchGraphWriter& writer) const;
public:
- ChartManager(size_t lineNumber, InputType const& source);
+ ChartManager(InputType const& source);
~ChartManager();
void ProcessSentence();
void AddXmlChartOptions();
@@ -109,9 +108,6 @@ public:
const ChartParser &GetParser() const { return m_parser; }
- size_t GetLineNumber() const {
- return m_lineNumber;
- }
};
}
diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp
index 5c1e82184..a79e1bc68 100644
--- a/moses/ChartParser.cpp
+++ b/moses/ChartParser.cpp
@@ -100,10 +100,9 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
targetPhrase->EvaluateInIsolation(*unksrc);
-
targetPhrase->SetTargetLHS(targetLHS);
targetPhrase->SetAlignmentInfo("0-0");
- if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.GetTreeStructure() != NULL) {
+ if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.PrintNBestTrees() || staticData.GetTreeStructure() != NULL) {
targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
}
diff --git a/moses/FF/DecodeFeature.cpp b/moses/FF/DecodeFeature.cpp
index 2f4ae43af..6581c7ab0 100644
--- a/moses/FF/DecodeFeature.cpp
+++ b/moses/FF/DecodeFeature.cpp
@@ -50,8 +50,8 @@ DecodeFeature::DecodeFeature(size_t numScoreComponents
, const std::vector<FactorType> &output
, const std::string &line)
: StatelessFeatureFunction(numScoreComponents, line)
- , m_container(NULL)
, m_input(input), m_output(output)
+ , m_container(NULL)
{
m_inputFactors = FactorMask(input);
m_outputFactors = FactorMask(output);
diff --git a/moses/FF/DecodeFeature.h b/moses/FF/DecodeFeature.h
index ac4e9392b..a93eb9ba3 100644
--- a/moses/FF/DecodeFeature.h
+++ b/moses/FF/DecodeFeature.h
@@ -68,6 +68,9 @@ public:
void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
+ ScoreComponentCollection* accumulator) const
+ {}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
diff --git a/moses/FF/ExternalFeature.h b/moses/FF/ExternalFeature.h
index a8916a853..888fef951 100644
--- a/moses/FF/ExternalFeature.h
+++ b/moses/FF/ExternalFeature.h
@@ -87,6 +87,7 @@ protected:
class CdecFF
{
public:
+ virtual ~CdecFF() {}
virtual int StateSize() const = 0;
};
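The one-line CdecFF change adds a virtual destructor to a polymorphic base, which matters whenever such objects are deleted through base-class pointers, as an interface with pure virtual methods invites. Without it the derived destructor never runs and the delete is formally undefined behavior. A minimal illustration:

    // Deleting a derived object through a base pointer requires a
    // virtual destructor in the base.
    struct Base { virtual ~Base() {} };

    struct Derived : Base {
      ~Derived() { /* release derived-owned resources */ }
    };

    int main() {
      Base* p = new Derived;
      delete p;  // runs ~Derived, then ~Base, because ~Base is virtual
    }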
diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp
index 7cb86d104..6e402e31d 100644
--- a/moses/FF/Factory.cpp
+++ b/moses/FF/Factory.cpp
@@ -42,6 +42,7 @@
#include "moses/FF/SourceGHKMTreeInputMatchFeature.h"
#include "moses/FF/HyperParameterAsWeight.h"
#include "moses/FF/SetSourcePhrase.h"
+#include "moses/FF/PhraseOrientationFeature.h"
#include "CountNonTerms.h"
#include "ReferenceComparison.h"
#include "RuleScope.h"
@@ -49,13 +50,14 @@
#include "NieceTerminal.h"
#include "SpanLength.h"
#include "SyntaxRHS.h"
-#include "moses/FF/PhraseOrientationFeature.h"
#include "moses/FF/SkeletonStatelessFF.h"
#include "moses/FF/SkeletonStatefulFF.h"
#include "moses/LM/SkeletonLM.h"
+#include "moses/LM/BilingualLM.h"
#include "SkeletonChangeInput.h"
#include "moses/TranslationModel/SkeletonPT.h"
+#include "moses/Syntax/RuleTableFF.h"
#ifdef HAVE_CMPH
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
@@ -90,14 +92,16 @@
#ifdef LM_NEURAL
#include "moses/LM/NeuralLMWrapper.h"
+#include "moses/LM/bilingual-lm/BiLM_NPLM.h"
#endif
#ifdef LM_DALM
#include "moses/LM/DALMWrapper.h"
#endif
-#ifdef LM_LBL
-#include "moses/LM/oxlm/LBLLM.h"
+#ifdef LM_OXLM
+#include "moses/LM/oxlm/OxLM.h"
+#include "moses/LM/oxlm/SourceOxLM.h"
#endif
#include "util/exception.hh"
@@ -174,6 +178,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(PhraseDictionaryDynSuffixArray);
MOSES_FNAME(PhraseDictionaryTransliteration);
MOSES_FNAME(PhraseDictionaryFuzzyMatch);
+ MOSES_FNAME2("RuleTable", Syntax::RuleTableFF);
MOSES_FNAME(GlobalLexicalModel);
//MOSES_FNAME(GlobalLexicalModelUnlimited); This was commented out in the original
@@ -248,14 +253,16 @@ FeatureRegistry::FeatureRegistry()
#endif
#ifdef LM_NEURAL
MOSES_FNAME2("NeuralLM", NeuralLMWrapper);
+ MOSES_FNAME2("BilingualNPLM", BilingualLM_NPLM);
#endif
#ifdef LM_DALM
MOSES_FNAME2("DALM", LanguageModelDALM);
#endif
-#ifdef LM_LBL
- MOSES_FNAME2("LBLLM-LM", LBLLM<oxlm::LM>);
- MOSES_FNAME2("LBLLM-FactoredLM", LBLLM<oxlm::FactoredLM>);
- MOSES_FNAME2("LBLLM-FactoredMaxentLM", LBLLM<oxlm::FactoredMaxentLM>);
+#ifdef LM_OXLM
+ MOSES_FNAME2("OxLM", OxLM<oxlm::LM>);
+ MOSES_FNAME2("OxFactoredLM", OxLM<oxlm::FactoredLM>);
+ MOSES_FNAME2("OxFactoredMaxentLM", OxLM<oxlm::FactoredMaxentLM>);
+ MOSES_FNAME2("OxSourceFactoredLM", SourceOxLM);
#endif
Add("KENLM", new KenFactory());
diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp
index f6eb165a8..5c603bc51 100644
--- a/moses/FF/GlobalLexicalModel.cpp
+++ b/moses/FF/GlobalLexicalModel.cpp
@@ -165,12 +165,12 @@ float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetP
return score;
}
-void GlobalLexicalModel::EvaluateWhenApplied
-(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+void GlobalLexicalModel::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
- accumulator->PlusEquals( this,
- GetFromCacheOrScorePhrase(hypo.GetCurrTargetPhrase()) );
+ scoreBreakdown.PlusEquals( this, GetFromCacheOrScorePhrase(targetPhrase) );
}
bool GlobalLexicalModel::IsUseable(const FactorMask &mask) const
diff --git a/moses/FF/GlobalLexicalModel.h b/moses/FF/GlobalLexicalModel.h
index 151dbf472..65b5cf2b8 100644
--- a/moses/FF/GlobalLexicalModel.h
+++ b/moses/FF/GlobalLexicalModel.h
@@ -70,15 +70,17 @@ public:
bool IsUseable(const FactorMask &mask) const;
- void EvaluateWhenApplied(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const;
-
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
- void EvaluateWhenApplied(
- const ChartHypothesis& hypo,
- ScoreComponentCollection* accumulator) const {
- throw std::logic_error("GlobalLexicalModel not supported in chart decoder, yet");
- }
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
@@ -87,12 +89,6 @@ public:
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void EvaluateInIsolation(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
- {}
-
};
diff --git a/moses/FF/InternalTree.cpp b/moses/FF/InternalTree.cpp
index b44fd2a65..2537cc50f 100644
--- a/moses/FF/InternalTree.cpp
+++ b/moses/FF/InternalTree.cpp
@@ -45,7 +45,7 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos) {
}
}
else if (token == ' ' || token == ']') {
- if (value.size() > 0 && ! m_value.size() > 0) {
+ if (value.size() > 0 && !(m_value.size() > 0)) {
m_value = value;
}
else if (value.size() > 0) {
@@ -227,4 +227,4 @@ bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vec
return false;
}
-} \ No newline at end of file
+}
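The InternalTree change is a precedence repair: ! binds tighter than >, so the old expression parsed as (!m_value.size()) > 0. For a comparison against zero the two parses happen to agree (both test for an empty string), so in practice the edit silences a compiler warning rather than changing behavior, but with any other threshold the unparenthesized form goes wrong:

    #include <iostream>
    #include <string>

    int main() {
      std::string s = "NP";
      // '!' applies first, so this is (!s.size()) > 0, i.e. "s is empty".
      std::cout << (! s.size() > 0) << '\n';    // 0; agrees with intent here
      std::cout << (!(s.size() > 0)) << '\n';   // 0; the intended reading
      // With a different threshold the parses diverge:
      std::cout << (! s.size() > 1) << '\n';    // always 0, whatever s is
      std::cout << (!(s.size() > 1)) << '\n';   // 0 here, 1 if s.size() <= 1
    }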
diff --git a/moses/FF/PhraseLengthFeature.h b/moses/FF/PhraseLengthFeature.h
index 4976e2210..9233aa3e7 100644
--- a/moses/FF/PhraseLengthFeature.h
+++ b/moses/FF/PhraseLengthFeature.h
@@ -29,9 +29,8 @@ public:
{}
void EvaluateWhenApplied(const ChartHypothesis& hypo,
- ScoreComponentCollection*) const {
- throw std::logic_error("PhraseLengthFeature not valid in chart decoder");
- }
+ ScoreComponentCollection*) const
+ {}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
diff --git a/moses/FF/PhrasePairFeature.cpp b/moses/FF/PhrasePairFeature.cpp
index f359b68f7..6daab7e25 100644
--- a/moses/FF/PhrasePairFeature.cpp
+++ b/moses/FF/PhrasePairFeature.cpp
@@ -106,12 +106,14 @@ void PhrasePairFeature::Load()
}
}
-void PhrasePairFeature::EvaluateWhenApplied(
- const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
{
- const TargetPhrase& target = hypo.GetCurrTargetPhrase();
- const Phrase& source = hypo.GetTranslationOption().GetInputPath().GetPhrase();
+ const Phrase& source = inputPath.GetPhrase();
if (m_simple) {
ostringstream namestr;
namestr << "pp_";
@@ -122,17 +124,17 @@ void PhrasePairFeature::EvaluateWhenApplied(
namestr << sourceFactor->GetString();
}
namestr << "~";
- namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
- for (size_t i = 1; i < target.GetSize(); ++i) {
- const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
+ namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
+ for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
+ const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
- accumulator->SparsePlusEquals(namestr.str(),1);
+ scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
- const Sentence& input = static_cast<const Sentence&>(hypo.GetInput());
+    const Sentence& sentence = static_cast<const Sentence&>(input);
const bool use_topicid = sentence.GetUseTopicId();
const bool use_topicid_prob = sentence.GetUseTopicIdAndProb();
@@ -145,9 +147,9 @@ void PhrasePairFeature::EvaluateWhenApplied(
pair << sourceFactor->GetString();
}
pair << "~";
- pair << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
- for (size_t i = 1; i < target.GetSize(); ++i) {
- const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
+ pair << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
+ for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
+ const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
pair << ",";
pair << targetFactor->GetString();
}
@@ -165,7 +167,7 @@ void PhrasePairFeature::EvaluateWhenApplied(
feature << "_";
feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
// use topic probabilities
const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
@@ -173,7 +175,7 @@ void PhrasePairFeature::EvaluateWhenApplied(
stringstream feature;
feature << "pp_unk_";
feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
@@ -181,7 +183,7 @@ void PhrasePairFeature::EvaluateWhenApplied(
feature << topicid_prob[i];
feature << "_";
feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
+ scoreBreakdown.SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
}
}
}
@@ -195,12 +197,12 @@ void PhrasePairFeature::EvaluateWhenApplied(
namestr << sourceTrigger;
namestr << "_";
namestr << pair.str();
- accumulator->SparsePlusEquals(namestr.str(),1);
+ scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
}
}
if (m_sourceContext) {
- const Sentence& input = static_cast<const Sentence&>(hypo.GetInput());
+    const Sentence& input = static_cast<const Sentence&>(inputType);
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
@@ -229,14 +231,14 @@ void PhrasePairFeature::EvaluateWhenApplied(
namestr << sourceFactor->GetString();
}
namestr << "~";
- namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
- for (size_t i = 1; i < target.GetSize(); ++i) {
- const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
+ namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
+ for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
+ const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
- accumulator->SparsePlusEquals(namestr.str(),1);
+ scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
}
}
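
The Sentence casts in this hunk take care not to reuse the name of the value being initialized: in C++ a declared name is already in scope inside its own initializer, so "const Sentence& input = static_cast<const Sentence&>(input);" would bind the new reference to itself rather than to a parameter, which is undefined behavior (GCC's -Winit-self flags it). A minimal sketch of the pitfall; Base, Derived and evaluate are hypothetical stand-ins, not the Moses InputType/Sentence classes:

    #include <iostream>

    struct Base { virtual ~Base() {} };
    struct Derived : Base { int topicId; Derived() : topicId(42) {} };

    void evaluate(const Base &input) {
      // BUG (do not do this): the 'input' in the initializer would name the
      // new local reference, not the parameter -- the reference binds to
      // itself, which is undefined behavior:
      //   const Derived &input = static_cast<const Derived&>(input);

      // Fix: cast from a differently named source (or rename the parameter).
      const Derived &sentence = static_cast<const Derived&>(input);
      std::cout << sentence.topicId << std::endl;
    }

    int main() {
      Derived d;
      evaluate(d);
      return 0;
    }
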
diff --git a/moses/FF/PhrasePairFeature.h b/moses/FF/PhrasePairFeature.h
index 8bfac628d..b0f380d0a 100644
--- a/moses/FF/PhrasePairFeature.h
+++ b/moses/FF/PhrasePairFeature.h
@@ -35,31 +35,32 @@ class PhrasePairFeature: public StatelessFeatureFunction
public:
PhrasePairFeature(const std::string &line);
- bool IsUseable(const FactorMask &mask) const;
+ void Load();
+ void SetParameter(const std::string& key, const std::string& value);
- void EvaluateWhenApplied(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const;
+ bool IsUseable(const FactorMask &mask) const;
- void EvaluateWhenApplied(const ChartHypothesis& hypo,
- ScoreComponentCollection*) const {
- throw std::logic_error("PhrasePairFeature not valid in chart decoder");
- }
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+ {}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
{}
- void EvaluateInIsolation(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+
+ void EvaluateWhenApplied(const ChartHypothesis& hypo,
+ ScoreComponentCollection*) const
{}
- void Load();
- void SetParameter(const std::string& key, const std::string& value);
};
diff --git a/moses/FF/PhrasePenalty.h b/moses/FF/PhrasePenalty.h
index b15a80224..80635b4e1 100644
--- a/moses/FF/PhrasePenalty.h
+++ b/moses/FF/PhrasePenalty.h
@@ -25,6 +25,9 @@ public:
void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
+ ScoreComponentCollection* accumulator) const
+ {}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
diff --git a/moses/FF/SourceWordDeletionFeature.cpp b/moses/FF/SourceWordDeletionFeature.cpp
index e5167b93b..dcbba6a0a 100644
--- a/moses/FF/SourceWordDeletionFeature.cpp
+++ b/moses/FF/SourceWordDeletionFeature.cpp
@@ -83,10 +83,7 @@ void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source,
if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
// flag aligned words
- bool aligned[16];
- UTIL_THROW_IF2(sourceLength >= 16, "Source length must be less than 16 words");
- for(size_t i=0; i<sourceLength; i++)
- aligned[i] = false;
+ std::vector<bool> aligned(sourceLength, false);
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++)
aligned[ alignmentPoint->first ] = true;
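
Sizing a std::vector<bool> to the phrase at hand replaces the fixed "bool aligned[16]" buffer and its hard 16-word guard, so arbitrarily long phrases can be flagged with the same idiom. A minimal sketch, with a plain std::set of index pairs standing in for AlignmentInfo:

    #include <cstddef>
    #include <set>
    #include <utility>
    #include <vector>

    int main() {
      const std::size_t sourceLength = 20;  // would have tripped the old 16-word limit
      std::set<std::pair<std::size_t, std::size_t> > alignment;  // (source, target) points
      alignment.insert(std::make_pair(0, 0));
      alignment.insert(std::make_pair(19, 3));

      // One flag per source word, default false -- no fixed upper bound.
      std::vector<bool> aligned(sourceLength, false);
      std::set<std::pair<std::size_t, std::size_t> >::const_iterator it;
      for (it = alignment.begin(); it != alignment.end(); ++it)
        aligned[it->first] = true;

      return aligned[19] ? 0 : 1;  // exits 0: source word 19 was flagged
    }
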
diff --git a/moses/FF/StatefulFeatureFunction.h b/moses/FF/StatefulFeatureFunction.h
index 86bed04ee..950b122e9 100644
--- a/moses/FF/StatefulFeatureFunction.h
+++ b/moses/FF/StatefulFeatureFunction.h
@@ -2,6 +2,8 @@
#include "FeatureFunction.h"
+#include "moses/Syntax/SHyperedge.h"
+
namespace Moses
{
class FFState;
@@ -39,6 +41,11 @@ public:
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const = 0;
+ virtual FFState* EvaluateWhenApplied(
+ const Syntax::SHyperedge& /* cur_hypo */,
+ int /* featureID - used to index the state in the previous hypotheses */,
+ ScoreComponentCollection* accumulator) const { assert(false); return 0; /* FIXME */ }
+
//! return the state associated with the empty hypothesis for a given sentence
virtual const FFState* EmptyHypothesisState(const InputType &input) const = 0;
diff --git a/moses/FF/StatelessFeatureFunction.h b/moses/FF/StatelessFeatureFunction.h
index 94029f882..abf029f0c 100644
--- a/moses/FF/StatelessFeatureFunction.h
+++ b/moses/FF/StatelessFeatureFunction.h
@@ -2,6 +2,8 @@
#include "FeatureFunction.h"
+#include "moses/Syntax/SHyperedge.h"
+
namespace Moses
{
@@ -32,6 +34,9 @@ public:
virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const = 0;
+ virtual void EvaluateWhenApplied(const Syntax::SHyperedge &,
+ ScoreComponentCollection*) const { assert(false); }
+
virtual bool IsStateless() const {
return true;
}
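
The new EvaluateWhenApplied(const Syntax::SHyperedge&, ...) overload gets a failing default body, assert(false), rather than being declared pure virtual: only feature functions that the new syntax decoder actually reaches must override it, and every other subclass keeps compiling unchanged. A stripped-down sketch of that trade-off; the names are illustrative, not the Moses classes:

    #include <cassert>
    #include <iostream>

    struct Hyperedge {};  // stand-in for Syntax::SHyperedge

    struct Feature {
      virtual ~Feature() {}
      // Existing, mandatory part of the interface.
      virtual void EvaluateWhenApplied(int hypo) const = 0;
      // New overload: a failing default instead of '= 0', so subclasses the
      // new decoder never reaches need not be touched yet.
      virtual void EvaluateWhenApplied(const Hyperedge &) const { assert(false); }
    };

    struct WordPenalty : Feature {
      void EvaluateWhenApplied(int) const { std::cout << "hypothesis path\n"; }
      void EvaluateWhenApplied(const Hyperedge &) const { std::cout << "hyperedge path\n"; }
    };

    int main() {
      WordPenalty wp;
      const Feature &f = wp;
      f.EvaluateWhenApplied(0);            // dispatches to the int overload
      f.EvaluateWhenApplied(Hyperedge());  // dispatches to the new overload
      return 0;
    }
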
diff --git a/moses/FF/SyntaxRHS.cpp b/moses/FF/SyntaxRHS.cpp
index 5168b72d7..24b3bf062 100644
--- a/moses/FF/SyntaxRHS.cpp
+++ b/moses/FF/SyntaxRHS.cpp
@@ -42,13 +42,5 @@ void SyntaxRHS::EvaluateWithSourceContext(const InputType &input
}
-void SyntaxRHS::EvaluateWhenApplied(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
-{}
-
-void SyntaxRHS::EvaluateWhenApplied(const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const
-{}
-
}
diff --git a/moses/FF/SyntaxRHS.h b/moses/FF/SyntaxRHS.h
index 4b9214995..4413aef72 100644
--- a/moses/FF/SyntaxRHS.h
+++ b/moses/FF/SyntaxRHS.h
@@ -26,9 +26,12 @@ public:
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
void EvaluateWhenApplied(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const
+ {}
+
void EvaluateWhenApplied(const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const
+ {}
};
diff --git a/moses/FF/TargetWordInsertionFeature.cpp b/moses/FF/TargetWordInsertionFeature.cpp
index c8db6bfe3..09a7b4472 100644
--- a/moses/FF/TargetWordInsertionFeature.cpp
+++ b/moses/FF/TargetWordInsertionFeature.cpp
@@ -73,11 +73,7 @@ void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source,
if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
// flag aligned words
- bool aligned[16];
- UTIL_THROW_IF2(targetLength >= 16, "Target length must be less than 16 words");
- for(size_t i=0; i<targetLength; i++) {
- aligned[i] = false;
- }
+ std::vector<bool> aligned(targetLength, false);
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++) {
aligned[ alignmentPoint->second ] = true;
}
diff --git a/moses/FF/UnknownWordPenaltyProducer.h b/moses/FF/UnknownWordPenaltyProducer.h
index 8850641e5..1aa6cbbcf 100644
--- a/moses/FF/UnknownWordPenaltyProducer.h
+++ b/moses/FF/UnknownWordPenaltyProducer.h
@@ -37,6 +37,9 @@ public:
void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
+ ScoreComponentCollection* accumulator) const
+ {}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
diff --git a/moses/FF/WordPenaltyProducer.h b/moses/FF/WordPenaltyProducer.h
index e62877307..19e331843 100644
--- a/moses/FF/WordPenaltyProducer.h
+++ b/moses/FF/WordPenaltyProducer.h
@@ -37,6 +37,9 @@ public:
void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
+ ScoreComponentCollection* accumulator) const
+ {}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
diff --git a/moses/FF/WordTranslationFeature.cpp b/moses/FF/WordTranslationFeature.cpp
index 7a98ad4c8..5111e677e 100644
--- a/moses/FF/WordTranslationFeature.cpp
+++ b/moses/FF/WordTranslationFeature.cpp
@@ -137,18 +137,19 @@ void WordTranslationFeature::Load()
}
}
-void WordTranslationFeature::EvaluateWhenApplied
-(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+void WordTranslationFeature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
{
- const Sentence& input = static_cast<const Sentence&>(hypo.GetInput());
- const TranslationOption& transOpt = hypo.GetTranslationOption();
- const TargetPhrase& targetPhrase = hypo.GetCurrTargetPhrase();
+ const Sentence& sentence = static_cast<const Sentence&>(input);
const AlignmentInfo &alignment = targetPhrase.GetAlignTerm();
// process aligned words
for (AlignmentInfo::const_iterator alignmentPoint = alignment.begin(); alignmentPoint != alignment.end(); alignmentPoint++) {
- const Phrase& sourcePhrase = transOpt.GetInputPath().GetPhrase();
+ const Phrase& sourcePhrase = inputPath.GetPhrase();
int sourceIndex = alignmentPoint->first;
int targetIndex = alignmentPoint->second;
Word ws = sourcePhrase.GetWord(sourceIndex);
@@ -183,15 +184,15 @@ void WordTranslationFeature::EvaluateWhenApplied
featureName << sourceWord;
featureName << "~";
featureName << targetWord;
- accumulator->SparsePlusEquals(featureName.str(), 1);
+ scoreBreakdown.SparsePlusEquals(featureName.str(), 1);
}
if (m_domainTrigger && !m_sourceContext) {
- const bool use_topicid = input.GetUseTopicId();
- const bool use_topicid_prob = input.GetUseTopicIdAndProb();
+ const bool use_topicid = sentence.GetUseTopicId();
+ const bool use_topicid_prob = sentence.GetUseTopicIdAndProb();
if (use_topicid || use_topicid_prob) {
if(use_topicid) {
// use topicid as trigger
- const long topicid = input.GetTopicId();
+ const long topicid = sentence.GetTopicId();
stringstream feature;
feature << m_description << "_";
if (topicid == -1)
@@ -203,7 +204,7 @@ void WordTranslationFeature::EvaluateWhenApplied
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
// use topic probabilities
-        const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
+        const vector<string> &topicid_prob = *(sentence.GetTopicIdAndProb());
@@ -213,7 +214,7 @@ void WordTranslationFeature::EvaluateWhenApplied
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
@@ -223,7 +224,7 @@ void WordTranslationFeature::EvaluateWhenApplied
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
+ scoreBreakdown.SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
}
}
}
@@ -239,12 +240,12 @@ void WordTranslationFeature::EvaluateWhenApplied
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
}
}
}
if (m_sourceContext) {
- size_t globalSourceIndex = hypo.GetTranslationOption().GetStartPos() + sourceIndex;
+ size_t globalSourceIndex = inputPath.GetWordsRange().GetStartPos() + sourceIndex;
if (!m_domainTrigger && globalSourceIndex == 0) {
// add <s> trigger feature for source
stringstream feature;
@@ -253,7 +254,7 @@ void WordTranslationFeature::EvaluateWhenApplied
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
}
// range over source words to get context
@@ -284,7 +285,7 @@ void WordTranslationFeature::EvaluateWhenApplied
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
}
} else if (m_unrestricted || sourceTriggerExists) {
stringstream feature;
@@ -300,7 +301,7 @@ void WordTranslationFeature::EvaluateWhenApplied
}
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
}
}
}
@@ -349,13 +350,6 @@ void WordTranslationFeature::EvaluateWhenApplied
}
}
-void WordTranslationFeature::EvaluateWhenApplied(
- const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const
-{
- UTIL_THROW(util::Exception, "Need source phrase. Can't be arsed at the moment");
-}
-
bool WordTranslationFeature::IsUseable(const FactorMask &mask) const
{
bool ret = mask[m_factorTypeTarget];
diff --git a/moses/FF/WordTranslationFeature.h b/moses/FF/WordTranslationFeature.h
index c213d8eb3..63e3749c7 100644
--- a/moses/FF/WordTranslationFeature.h
+++ b/moses/FF/WordTranslationFeature.h
@@ -48,24 +48,27 @@ public:
return new DummyState();
}
- void EvaluateWhenApplied(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const;
-
- void EvaluateWhenApplied(const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const;
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const
- {}
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
};
}
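
With these changes both PhrasePairFeature and WordTranslationFeature compute their sparse scores in EvaluateWithSourceContext, which runs when a translation option is built, instead of in EvaluateWhenApplied, which runs every time a hypothesis is extended with that option; the scores land in the option's scoreBreakdown and are reused from there. A toy sketch of that call pattern, with a hypothetical driver and a std::map standing in for ScoreComponentCollection:

    #include <iostream>
    #include <map>
    #include <string>

    typedef std::map<std::string, float> ScoreMap;  // toy ScoreComponentCollection

    // Runs once, when the translation option is created from the source context.
    void EvaluateWithSourceContext(const std::string &source,
                                   const std::string &target,
                                   ScoreMap &scoreBreakdown) {
      scoreBreakdown["pp_" + source + "~" + target] += 1;  // sparse feature
    }

    int main() {
      ScoreMap cached;
      EvaluateWithSourceContext("chat", "cat", cached);  // option creation time

      // Apply time: each hypothesis extension reuses the cached breakdown
      // instead of recomputing the feature from the hypothesis.
      for (int extension = 0; extension < 3; ++extension) {
        ScoreMap hypoScores = cached;
        std::cout << hypoScores.begin()->first << " = "
                  << hypoScores.begin()->second << "\n";
      }
      return 0;
    }
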
diff --git a/moses/HypergraphOutput.cpp b/moses/HypergraphOutput.cpp
index da7e804dc..5c689eaee 100644
--- a/moses/HypergraphOutput.cpp
+++ b/moses/HypergraphOutput.cpp
@@ -124,7 +124,7 @@ template<class M>
void HypergraphOutput<M>::Write(const M& manager) const {
stringstream fileName;
- fileName << m_hypergraphDir << "/" << manager.GetLineNumber();
+ fileName << m_hypergraphDir << "/" << manager.GetSource().GetTranslationId();
if ( m_appendSuffix ) {
fileName << "." << m_compression;
}
@@ -144,7 +144,7 @@ void HypergraphOutput<M>::Write(const M& manager) const {
manager.OutputSearchGraphAsHypergraph(file);
file.flush();
} else {
- TRACE_ERR("Cannot output hypergraph for line " << manager.GetLineNumber()
+ TRACE_ERR("Cannot output hypergraph for line " << manager.GetSource().GetTranslationId()
<< " because the output file " << fileName.str()
<< " is not open or not ready for writing"
<< std::endl);
diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses/IOWrapper.cpp
index 7f50fe246..1810184c6 100644
--- a/moses-chart-cmd/IOWrapper.cpp
+++ b/moses/IOWrapper.cpp
@@ -8,12 +8,12 @@ All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
+ * Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
+ * Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
+ * Neither the name of the University of Edinburgh nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
@@ -28,33 +28,44 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
+ ***********************************************************************/
// example file on how to use moses library
#include <iostream>
+#include <stack>
#include <boost/algorithm/string.hpp>
-#include "IOWrapper.h"
+
+#include "moses/Syntax/KBestExtractor.h"
+#include "moses/Syntax/SHyperedge.h"
+#include "moses/Syntax/S2T/DerivationWriter.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SVertex.h"
+
#include "moses/TypeDef.h"
#include "moses/Util.h"
+#include "moses/Hypothesis.h"
#include "moses/WordsRange.h"
+#include "moses/TrellisPathList.h"
#include "moses/StaticData.h"
-#include "moses/InputFileStream.h"
-#include "moses/Incremental.h"
-#include "moses/TranslationModel/PhraseDictionary.h"
-#include "moses/ChartTranslationOptions.h"
-#include "moses/ChartHypothesis.h"
#include "moses/FeatureVector.h"
+#include "moses/InputFileStream.h"
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/StatelessFeatureFunction.h"
-#include "moses/FF/TreeStructureFeature.h"
-#include "moses/PP/TreeStructurePhraseProperty.h"
+#include "moses/TreeInput.h"
+#include "moses/ConfusionNet.h"
+#include "moses/WordLattice.h"
+#include "moses/Incremental.h"
+#include "moses/ChartManager.h"
+
+
#include "util/exception.hh"
+#include "IOWrapper.h"
+
using namespace std;
-using namespace Moses;
-namespace MosesChartCmd
+namespace Moses
{
IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
@@ -66,45 +77,85 @@ IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
:m_inputFactorOrder(inputFactorOrder)
,m_outputFactorOrder(outputFactorOrder)
,m_inputFactorUsed(inputFactorUsed)
+ ,m_inputFilePath(inputFilePath)
+ ,m_nBestStream(NULL)
+
+ ,m_outputWordGraphStream(NULL)
,m_outputSearchGraphStream(NULL)
,m_detailedTranslationReportingStream(NULL)
- ,m_detailedTreeFragmentsTranslationReportingStream(NULL)
- ,m_alignmentInfoStream(NULL)
,m_unknownsStream(NULL)
- ,m_inputFilePath(inputFilePath)
- ,m_detailOutputCollector(NULL)
- ,m_detailTreeFragmentsOutputCollector(NULL)
- ,m_nBestOutputCollector(NULL)
- ,m_searchGraphOutputCollector(NULL)
+ ,m_alignmentInfoStream(NULL)
+ ,m_latticeSamplesStream(NULL)
+
,m_singleBestOutputCollector(NULL)
- ,m_alignmentInfoCollector(NULL)
+ ,m_nBestOutputCollector(NULL)
,m_unknownsCollector(NULL)
+ ,m_alignmentInfoCollector(NULL)
+ ,m_searchGraphOutputCollector(NULL)
+ ,m_detailedTranslationCollector(NULL)
+ ,m_wordGraphCollector(NULL)
+ ,m_latticeSamplesCollector(NULL)
+ ,m_detailTreeFragmentsOutputCollector(NULL)
+
+ ,m_surpressSingleBestOutput(false)
+
+ ,spe_src(NULL)
+ ,spe_trg(NULL)
+ ,spe_aln(NULL)
{
const StaticData &staticData = StaticData::Instance();
- if (m_inputFilePath.empty()) {
- m_inputStream = &std::cin;
- } else {
- m_inputStream = new InputFileStream(inputFilePath);
+ if (inputFilePath.empty()) {
+ m_inputFile = NULL;
+ m_inputStream = &cin;
+ }
+ else {
+ m_inputFile = new InputFileStream(inputFilePath);
+ m_inputStream = m_inputFile;
}
-
- bool suppressSingleBestOutput = false;
if (nBestSize > 0) {
- if (nBestFilePath == "-") {
+ if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") {
+ m_nBestStream = &std::cout;
m_nBestOutputCollector = new Moses::OutputCollector(&std::cout);
- suppressSingleBestOutput = true;
+ m_surpressSingleBestOutput = true;
} else {
- m_nBestOutputCollector = new Moses::OutputCollector(new std::ofstream(nBestFilePath.c_str()));
- m_nBestOutputCollector->HoldOutputStream();
+ std::ofstream *file = new std::ofstream;
+ file->open(nBestFilePath.c_str());
+ m_nBestStream = file;
+
+ m_nBestOutputCollector = new Moses::OutputCollector(file);
+ //m_nBestOutputCollector->HoldOutputStream();
}
}
- if (!suppressSingleBestOutput) {
- m_singleBestOutputCollector = new Moses::OutputCollector(&std::cout);
+ // search graph output
+ if (staticData.GetOutputSearchGraph()) {
+ string fileName;
+ if (staticData.GetOutputSearchGraphExtended())
+ fileName = staticData.GetParam("output-search-graph-extended")[0];
+ else
+ fileName = staticData.GetParam("output-search-graph")[0];
+ std::ofstream *file = new std::ofstream;
+ m_outputSearchGraphStream = file;
+ file->open(fileName.c_str());
+ }
+
+ if (!staticData.GetOutputUnknownsFile().empty()) {
+ m_unknownsStream = new std::ofstream(staticData.GetOutputUnknownsFile().c_str());
+ m_unknownsCollector = new Moses::OutputCollector(m_unknownsStream);
+ UTIL_THROW_IF2(!m_unknownsStream->good(),
+ "File for unknowns words could not be opened: " <<
+ staticData.GetOutputUnknownsFile());
+ }
+
+ if (!staticData.GetAlignmentOutputFile().empty()) {
+ m_alignmentInfoStream = new std::ofstream(staticData.GetAlignmentOutputFile().c_str());
+ m_alignmentInfoCollector = new Moses::OutputCollector(m_alignmentInfoStream);
+ UTIL_THROW_IF2(!m_alignmentInfoStream->good(),
+ "File for alignment output could not be opened: " << staticData.GetAlignmentOutputFile());
}
- // search graph output
if (staticData.GetOutputSearchGraph()) {
string fileName = staticData.GetParam("output-search-graph")[0];
std::ofstream *file = new std::ofstream;
@@ -117,7 +168,7 @@ IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
if (staticData.IsDetailedTranslationReportingEnabled()) {
const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
- m_detailOutputCollector = new Moses::OutputCollector(m_detailedTranslationReportingStream);
+ m_detailedTranslationCollector = new Moses::OutputCollector(m_detailedTranslationReportingStream);
}
if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
@@ -126,53 +177,74 @@ IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
m_detailTreeFragmentsOutputCollector = new Moses::OutputCollector(m_detailedTreeFragmentsTranslationReportingStream);
}
- if (!staticData.GetAlignmentOutputFile().empty()) {
- m_alignmentInfoStream = new std::ofstream(staticData.GetAlignmentOutputFile().c_str());
- m_alignmentInfoCollector = new Moses::OutputCollector(m_alignmentInfoStream);
- UTIL_THROW_IF2(!m_alignmentInfoStream->good(),
- "File for alignment output could not be opened: " << staticData.GetAlignmentOutputFile());
+ // wordgraph output
+ if (staticData.GetOutputWordGraph()) {
+ string fileName = staticData.GetParam("output-word-graph")[0];
+ std::ofstream *file = new std::ofstream;
+ m_outputWordGraphStream = file;
+ file->open(fileName.c_str());
+ m_wordGraphCollector = new OutputCollector(m_outputWordGraphStream);
}
- if (!staticData.GetOutputUnknownsFile().empty()) {
- m_unknownsStream = new std::ofstream(staticData.GetOutputUnknownsFile().c_str());
- m_unknownsCollector = new Moses::OutputCollector(m_unknownsStream);
- UTIL_THROW_IF2(!m_unknownsStream->good(),
- "File for unknowns words could not be opened: " <<
- staticData.GetOutputUnknownsFile());
+ size_t latticeSamplesSize = staticData.GetLatticeSamplesSize();
+ string latticeSamplesFile = staticData.GetLatticeSamplesFilePath();
+ if (latticeSamplesSize) {
+ if (latticeSamplesFile == "-" || latticeSamplesFile == "/dev/stdout") {
+ m_latticeSamplesCollector = new OutputCollector();
+ m_surpressSingleBestOutput = true;
+ } else {
+ m_latticeSamplesStream = new ofstream(latticeSamplesFile.c_str());
+ if (!m_latticeSamplesStream->good()) {
+ TRACE_ERR("ERROR: Failed to open " << latticeSamplesFile << " for lattice samples" << endl);
+ exit(1);
+ }
+ m_latticeSamplesCollector = new OutputCollector(m_latticeSamplesStream);
+ }
+ }
+
+ if (!m_surpressSingleBestOutput) {
+ m_singleBestOutputCollector = new Moses::OutputCollector(&std::cout);
+ }
+
+ if (staticData.GetParam("spe-src").size()) {
+ spe_src = new ifstream(staticData.GetParam("spe-src")[0].c_str());
+ spe_trg = new ifstream(staticData.GetParam("spe-trg")[0].c_str());
+ spe_aln = new ifstream(staticData.GetParam("spe-aln")[0].c_str());
}
}
IOWrapper::~IOWrapper()
{
- if (!m_inputFilePath.empty()) {
- delete m_inputStream;
+ if (m_inputFile != NULL)
+ delete m_inputFile;
+  if (m_nBestStream != NULL && m_nBestStream != &std::cout) {
+    // n-best went to a file rather than stdout; close it by deleting the stream
+    delete m_nBestStream;
}
- delete m_outputSearchGraphStream;
+
delete m_detailedTranslationReportingStream;
- delete m_detailedTreeFragmentsTranslationReportingStream;
- delete m_detailTreeFragmentsOutputCollector;
delete m_alignmentInfoStream;
delete m_unknownsStream;
- delete m_detailOutputCollector;
- delete m_nBestOutputCollector;
- delete m_searchGraphOutputCollector;
+ delete m_outputSearchGraphStream;
+ delete m_outputWordGraphStream;
+ delete m_latticeSamplesStream;
+
delete m_singleBestOutputCollector;
+ delete m_nBestOutputCollector;
delete m_alignmentInfoCollector;
- delete m_unknownsCollector;
-}
+ delete m_searchGraphOutputCollector;
+ delete m_detailedTranslationCollector;
+ delete m_wordGraphCollector;
+ delete m_latticeSamplesCollector;
+ delete m_detailTreeFragmentsOutputCollector;
-void IOWrapper::ResetTranslationId()
-{
- m_translationId = StaticData::Instance().GetStartTranslationId();
}
-InputType*IOWrapper::GetInput(InputType* inputType)
+InputType*
+IOWrapper::
+GetInput(InputType* inputType)
{
if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
- if (long x = inputType->GetTranslationId()) {
- if (x>=m_translationId) m_translationId = x+1;
- } else inputType->SetTranslationId(m_translationId++);
-
return inputType;
} else {
delete inputType;
@@ -180,50 +252,106 @@ InputType*IOWrapper::GetInput(InputType* inputType)
}
}
-
-/***
- * print surface factor only for the given phrase
- */
-void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors)
+void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
{
- UTIL_THROW_IF2(outputFactorOrder.size() == 0,
- "Cannot be empty phrase");
- if (reportAllFactors == true) {
- out << phrase;
- } else {
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
- out << *factor;
- UTIL_THROW_IF2(factor == NULL,
- "Empty factor 0 at position " << pos);
-
- for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
- UTIL_THROW_IF2(factor == NULL,
- "Empty factor " << i << " at position " << pos);
+ stream.setf(std::ios::fixed);
+ stream.precision(size);
+}
- out << "|" << *factor;
- }
- out << " ";
+std::map<size_t, const Factor*> IOWrapper::GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor)
+{
+ const InputPath &inputPath = hypo.GetTranslationOption().GetInputPath();
+ const Phrase &inputPhrase = inputPath.GetPhrase();
+
+ std::map<size_t, const Factor*> ret;
+
+ for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) {
+ const Factor *factor = inputPhrase.GetFactor(sourcePos, placeholderFactor);
+ if (factor) {
+ std::set<size_t> targetPos = hypo.GetTranslationOption().GetTargetPhrase().GetAlignTerm().GetAlignmentsForSource(sourcePos);
+ UTIL_THROW_IF2(targetPos.size() != 1,
+ "Placeholder should be aligned to 1, and only 1, word");
+ ret[*targetPos.begin()] = factor;
}
}
+
+ return ret;
}
-void OutputSurface(std::ostream &out, const ChartHypothesis *hypo, const std::vector<FactorType> &outputFactorOrder
- ,bool reportSegmentation, bool reportAllFactors)
+
+void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)
{
- if ( hypo != NULL) {
- //OutputSurface(out, hypo->GetCurrTargetPhrase(), outputFactorOrder, reportAllFactors);
+ if (!m_singleBestOutputCollector)
+ return;
+ std::ostringstream out;
+ FixPrecision(out);
+ if (hypo != NULL) {
+ VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
+ VERBOSE(3,"Best path: ");
+ Backtrack(hypo);
+ VERBOSE(3,"0" << std::endl);
- const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << hypo->GetTotalScore() << " ";
+ }
- vector<const ChartHypothesis*>::const_iterator iter;
- for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
- const ChartHypothesis *prevHypo = *iter;
+ if (StaticData::Instance().IsPathRecoveryEnabled()) {
+ out << "||| ";
+ }
+ Phrase outPhrase(ARRAY_SIZE_INCR);
+ hypo->GetOutputPhrase(outPhrase);
+
+ // delete 1st & last
+ UTIL_THROW_IF2(outPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+
+ outPhrase.RemoveWord(0);
+ outPhrase.RemoveWord(outPhrase.GetSize() - 1);
+
+ const std::vector<FactorType> outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+ string output = outPhrase.GetStringRep(outputFactorOrder);
+ out << output << endl;
+ } else {
+ VERBOSE(1, "NO BEST TRANSLATION" << endl);
- OutputSurface(out, prevHypo, outputFactorOrder, reportSegmentation, reportAllFactors);
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << "0 ";
}
+
+ out << endl;
+ }
+ m_singleBestOutputCollector->Write(translationId, out.str());
+}
+
+void IOWrapper::OutputBestHypo(search::Applied applied, long translationId)
+{
+ if (!m_singleBestOutputCollector) return;
+ std::ostringstream out;
+ FixPrecision(out);
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << applied.GetScore() << ' ';
+ }
+ Phrase outPhrase;
+ Incremental::ToPhrase(applied, outPhrase);
+ // delete 1st & last
+ UTIL_THROW_IF2(outPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+ outPhrase.RemoveWord(0);
+ outPhrase.RemoveWord(outPhrase.GetSize() - 1);
+ out << outPhrase.GetStringRep(StaticData::Instance().GetOutputFactorOrder());
+ out << '\n';
+ m_singleBestOutputCollector->Write(translationId, out.str());
+
+ VERBOSE(1,"BEST TRANSLATION: " << outPhrase << "[total=" << applied.GetScore() << "]" << endl);
+}
+
+void IOWrapper::OutputBestNone(long translationId)
+{
+ if (!m_singleBestOutputCollector) return;
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ m_singleBestOutputCollector->Write(translationId, "0 \n");
+ } else {
+ m_singleBestOutputCollector->Write(translationId, "\n");
}
}
@@ -240,35 +368,78 @@ void IOWrapper::Backtrack(const ChartHypothesis *hypo)
}
}
-void IOWrapper::OutputBestHypo(const std::vector<const Factor*>& mbrBestHypo, long /*translationId*/)
+void IOWrapper::OutputDetailedTranslationReport(
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId)
{
- for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
- const Factor *factor = mbrBestHypo[i];
- UTIL_THROW_IF(factor == NULL, util::Exception,
- "No factor at position " << i );
+ if (applied == NULL) {
+ return;
+ }
+ std::ostringstream out;
+ ApplicationContext applicationContext;
+
+ OutputTranslationOptions(out, applicationContext, applied, sentence, translationId);
+ UTIL_THROW_IF2(m_detailedTranslationCollector == NULL,
+ "No ouput file for detailed reports specified");
+ m_detailedTranslationCollector->Write(translationId, out.str());
+}
- cout << *factor << " ";
+void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
+{
+ if (hypo != NULL) {
+ OutputTranslationOption(out, applicationContext, hypo, sentence, translationId);
+ out << std::endl;
+ }
+
+ // recursive
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+ std::vector<const ChartHypothesis*>::const_iterator iter;
+ for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
+ const ChartHypothesis *prevHypo = *iter;
+ OutputTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
}
}
-/*
-void OutputInput(std::vector<const Phrase*>& map, const ChartHypothesis* hypo)
+
+
+void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
{
- if (hypo->GetPrevHypos())
- {
- OutputInput(map, hypo->GetPrevHypos());
- map[hypo->GetCurrSourceWordsRange().GetStartPos()] = hypo->GetSourcePhrase();
- }
+ if (applied != NULL) {
+ OutputTranslationOption(out, applicationContext, applied, sentence, translationId);
+ out << std::endl;
+ }
+
+ // recursive
+ const search::Applied *child = applied->Children();
+ for (size_t i = 0; i < applied->GetArity(); i++) {
+ OutputTranslationOptions(out, applicationContext, child++, sentence, translationId);
+ }
}
-void OutputInput(std::ostream& os, const ChartHypothesis* hypo)
+void IOWrapper::OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
{
- size_t len = StaticData::Instance().GetInput()->GetSize();
- std::vector<const Phrase*> inp_phrases(len, 0);
- OutputInput(inp_phrases, hypo);
- for (size_t i=0; i<len; ++i)
- if (inp_phrases[i]) os << *inp_phrases[i];
+ ReconstructApplicationContext(*hypo, sentence, applicationContext);
+ out << "Trans Opt " << translationId
+ << " " << hypo->GetCurrSourceRange()
+ << ": ";
+ WriteApplicationContext(out, applicationContext);
+ out << ": " << hypo->GetCurrTargetPhrase().GetTargetLHS()
+ << "->" << hypo->GetCurrTargetPhrase()
+ << " " << hypo->GetTotalScore() << hypo->GetScoreBreakdown();
+}
+
+void IOWrapper::OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
+{
+ ReconstructApplicationContext(applied, sentence, applicationContext);
+ const TargetPhrase &phrase = *static_cast<const TargetPhrase*>(applied->GetNote().vp);
+ out << "Trans Opt " << translationId
+ << " " << applied->GetRange()
+ << ": ";
+ WriteApplicationContext(out, applicationContext);
+ out << ": " << phrase.GetTargetLHS()
+ << "->" << phrase
+ << " " << applied->GetScore(); // << hypo->GetScoreBreakdown() TODO: missing in incremental search hypothesis
}
-*/
// Given a hypothesis and sentence, reconstructs the 'application context' --
// the source RHS symbols of the SCFG rule that was applied, plus their spans.
@@ -299,7 +470,6 @@ void IOWrapper::ReconstructApplicationContext(const ChartHypothesis &hypo,
}
}
-
// Given a hypothesis and sentence, reconstructs the 'application context' --
// the source RHS symbols of the SCFG rule that was applied, plus their spans.
void IOWrapper::ReconstructApplicationContext(const search::Applied *applied,
@@ -330,7 +500,6 @@ void IOWrapper::ReconstructApplicationContext(const search::Applied *applied,
}
}
-
// Emulates the old operator<<(ostream &, const DottedRule &) function. The
// output format is a bit odd (reverse order and double spacing between symbols)
// but there are scripts and tools that expect the output of -T to look like
@@ -349,61 +518,58 @@ void IOWrapper::WriteApplicationContext(std::ostream &out,
}
}
-void IOWrapper::OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
-{
- ReconstructApplicationContext(*hypo, sentence, applicationContext);
- out << "Trans Opt " << translationId
- << " " << hypo->GetCurrSourceRange()
- << ": ";
- WriteApplicationContext(out, applicationContext);
- out << ": " << hypo->GetCurrTargetPhrase().GetTargetLHS()
- << "->" << hypo->GetCurrTargetPhrase()
- << " " << hypo->GetTotalScore() << hypo->GetScoreBreakdown();
-}
-
-void IOWrapper::OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
+void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId)
{
- ReconstructApplicationContext(applied, sentence, applicationContext);
- const TargetPhrase &phrase = *static_cast<const TargetPhrase*>(applied->GetNote().vp);
- out << "Trans Opt " << translationId
- << " " << applied->GetRange()
- << ": ";
- WriteApplicationContext(out, applicationContext);
- out << ": " << phrase.GetTargetLHS()
- << "->" << phrase
- << " " << applied->GetScore(); // << hypo->GetScoreBreakdown() TODO: missing in incremental search hypothesis
-}
+ if (hypo == NULL) {
+ return;
+ }
+ std::ostringstream out;
+ ApplicationContext applicationContext;
+ OutputTreeFragmentsTranslationOptions(out, applicationContext, hypo, sentence, translationId);
+ UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
+ "No output file for tree fragments specified");
-void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
-{
- if (hypo != NULL) {
- OutputTranslationOption(out, applicationContext, hypo, sentence, translationId);
- out << std::endl;
+ //Tree of full sentence
+ const StatefulFeatureFunction* treeStructure = StaticData::Instance().GetTreeStructure();
+ if (treeStructure != NULL) {
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for( size_t i=0; i<sff.size(); i++ ) {
+ if (sff[i] == treeStructure) {
+ const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
+ out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
+ break;
+ }
+ }
}
- // recursive
- const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
- std::vector<const ChartHypothesis*>::const_iterator iter;
- for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
- const ChartHypothesis *prevHypo = *iter;
- OutputTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
- }
-}
+ m_detailTreeFragmentsOutputCollector->Write(translationId, out.str());
+}
-void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
+void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId)
{
- if (applied != NULL) {
- OutputTranslationOption(out, applicationContext, applied, sentence, translationId);
- out << std::endl;
+ if (applied == NULL) {
+ return;
}
+ std::ostringstream out;
+ ApplicationContext applicationContext;
+
+ OutputTreeFragmentsTranslationOptions(out, applicationContext, applied, sentence, translationId);
+ UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
+ "No output file for tree fragments specified");
+
+ //Tree of full sentence
+ //TODO: incremental search doesn't support stateful features
+
+ m_detailTreeFragmentsOutputCollector->Write(translationId, out.str());
- // recursive
- const search::Applied *child = applied->Children();
- for (size_t i = 0; i < applied->GetArity(); i++) {
- OutputTranslationOptions(out, applicationContext, child++, sentence, translationId);
- }
}
void IOWrapper::OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
@@ -456,92 +622,183 @@ void IOWrapper::OutputTreeFragmentsTranslationOptions(std::ostream &out, Applica
}
}
-void IOWrapper::OutputDetailedTranslationReport(
- const ChartHypothesis *hypo,
- const Sentence &sentence,
- long translationId)
+void IOWrapper::OutputNBestList(const std::vector<search::Applied> &nbest, long translationId)
{
- if (hypo == NULL) {
- return;
- }
std::ostringstream out;
- ApplicationContext applicationContext;
+  assert(m_nBestOutputCollector);
+  // set precision only when writing to cout, mirroring the original OutputNBestList
+  if (m_nBestOutputCollector->OutputIsCout()) {
+ FixPrecision(out);
+ }
+ Phrase outputPhrase;
+ ScoreComponentCollection features;
+ for (std::vector<search::Applied>::const_iterator i = nbest.begin(); i != nbest.end(); ++i) {
+ Incremental::PhraseAndFeatures(*i, outputPhrase, features);
+ // <s> and </s>
+ UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
- OutputTranslationOptions(out, applicationContext, hypo, sentence, translationId);
- UTIL_THROW_IF2(m_detailOutputCollector == NULL,
- "No ouput file for detailed reports specified");
- m_detailOutputCollector->Write(translationId, out.str());
+ outputPhrase.RemoveWord(0);
+ outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
+ out << translationId << " ||| ";
+ OutputSurface(out, outputPhrase, m_outputFactorOrder, false);
+ out << " ||| ";
+ OutputAllFeatureScores(features, out);
+ out << " ||| " << i->GetScore() << '\n';
+ }
+ out << std::flush;
+ m_nBestOutputCollector->Write(translationId, out.str());
}
-void IOWrapper::OutputDetailedTranslationReport(
- const search::Applied *applied,
- const Sentence &sentence,
- long translationId)
+/***
+ * print surface factor only for the given phrase
+ */
+void IOWrapper::OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors)
{
- if (applied == NULL) {
- return;
- }
- std::ostringstream out;
- ApplicationContext applicationContext;
+ UTIL_THROW_IF2(outputFactorOrder.size() == 0,
+ "Cannot be empty phrase");
+ if (reportAllFactors == true) {
+ out << phrase;
+ } else {
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+      UTIL_THROW_IF2(factor == NULL,
+                     "Empty factor 0 at position " << pos);
+      out << *factor;
- OutputTranslationOptions(out, applicationContext, applied, sentence, translationId);
- UTIL_THROW_IF2(m_detailOutputCollector == NULL,
- "No ouput file for detailed reports specified");
- m_detailOutputCollector->Write(translationId, out.str());
+ for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
+ UTIL_THROW_IF2(factor == NULL,
+ "Empty factor " << i << " at position " << pos);
+
+ out << "|" << *factor;
+ }
+ out << " ";
+ }
+ }
}
-void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
- const ChartHypothesis *hypo,
- const Sentence &sentence,
- long translationId)
+void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothesis *hypo)
{
- if (hypo == NULL) {
- return;
+ ostringstream out;
+
+ if (hypo) {
+ Alignments retAlign;
+ OutputAlignment(retAlign, hypo, 0);
+
+ // output alignments
+ Alignments::const_iterator iter;
+ for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
+ const pair<size_t, size_t> &alignPoint = *iter;
+ out << alignPoint.first << "-" << alignPoint.second << " ";
+ }
}
- std::ostringstream out;
- ApplicationContext applicationContext;
+ out << endl;
- OutputTreeFragmentsTranslationOptions(out, applicationContext, hypo, sentence, translationId);
- UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
- "No output file for tree fragments specified");
+ m_alignmentInfoCollector->Write(translationId, out.str());
+}
- //Tree of full sentence
- const StatefulFeatureFunction* treeStructure = StaticData::Instance().GetTreeStructure();
- if (treeStructure != NULL) {
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for( size_t i=0; i<sff.size(); i++ ) {
- if (sff[i] == treeStructure) {
- const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
- out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
- break;
- }
+size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget)
+{
+ size_t totalTargetSize = 0;
+ size_t startSource = hypo->GetCurrSourceRange().GetStartPos();
+
+ const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
+
+ size_t thisSourceSize = CalcSourceSize(hypo);
+
+ // position of each terminal word in translation rule, irrespective of alignment
+ // if non-term, number is undefined
+ vector<size_t> sourceOffsets(thisSourceSize, 0);
+ vector<size_t> targetOffsets(tp.GetSize(), 0);
+
+ const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+
+ const AlignmentInfo &aiNonTerm = hypo->GetCurrTargetPhrase().GetAlignNonTerm();
+ vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
+ const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
+
+  UTIL_THROW_IF2(sourceInd2pos.size() != prevHypos.size(), "Mismatch between non-terminal source positions and previous hypotheses");
+
+ size_t targetInd = 0;
+ for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
+ if (tp.GetWord(targetPos).IsNonTerminal()) {
+      UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Target position out of range in non-terminal index map");
+ size_t sourceInd = targetPos2SourceInd[targetPos];
+ size_t sourcePos = sourceInd2pos[sourceInd];
+
+ const ChartHypothesis *prevHypo = prevHypos[sourceInd];
+
+ // calc source size
+ size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
+ sourceOffsets[sourcePos] = sourceSize;
+
+ // calc target size.
+ // Recursively look thru child hypos
+ size_t currStartTarget = startTarget + totalTargetSize;
+ size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
+ targetOffsets[targetPos] = targetSize;
+
+ totalTargetSize += targetSize;
+ ++targetInd;
+ } else {
+ ++totalTargetSize;
}
}
- m_detailTreeFragmentsOutputCollector->Write(translationId, out.str());
+ // convert position within translation rule to absolute position within
+ // source sentence / output sentence
+ ShiftOffsets(sourceOffsets, startSource);
+ ShiftOffsets(targetOffsets, startTarget);
+ // get alignments from this hypo
+ const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
+
+ // add to output arg, offsetting by source & target
+ AlignmentInfo::const_iterator iter;
+ for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+ const std::pair<size_t,size_t> &align = *iter;
+ size_t relSource = align.first;
+ size_t relTarget = align.second;
+ size_t absSource = sourceOffsets[relSource];
+ size_t absTarget = targetOffsets[relTarget];
+
+ pair<size_t, size_t> alignPoint(absSource, absTarget);
+ pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+    UTIL_THROW_IF2(!ret.second, "Duplicate alignment point");
+
+ }
+
+ return totalTargetSize;
}
-void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
- const search::Applied *applied,
+size_t IOWrapper::CalcSourceSize(const Moses::ChartHypothesis *hypo)
+{
+ size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+ for (size_t i = 0; i < prevHypos.size(); ++i) {
+ size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
+ ret -= (childSize - 1);
+ }
+ return ret;
+}
+
+void IOWrapper::OutputDetailedTranslationReport(
+ const ChartHypothesis *hypo,
const Sentence &sentence,
long translationId)
{
- if (applied == NULL) {
+ if (hypo == NULL) {
return;
}
std::ostringstream out;
ApplicationContext applicationContext;
- OutputTreeFragmentsTranslationOptions(out, applicationContext, applied, sentence, translationId);
- UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
- "No output file for tree fragments specified");
-
- //Tree of full sentence
- //TODO: incremental search doesn't support stateful features
-
- m_detailTreeFragmentsOutputCollector->Write(translationId, out.str());
-
+ OutputTranslationOptions(out, applicationContext, hypo, sentence, translationId);
+ UTIL_THROW_IF2(m_detailedTranslationCollector == NULL,
+ "No ouput file for detailed reports specified");
+ m_detailedTranslationCollector->Write(translationId, out.str());
}
//DIMw
@@ -573,129 +830,20 @@ void IOWrapper::OutputDetailedAllTranslationReport(
}
}
}
- UTIL_THROW_IF2(m_detailAllOutputCollector == NULL,
+ UTIL_THROW_IF2(m_detailedTranslationCollector == NULL,
"No output file for details specified");
- m_detailAllOutputCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)
-{
- if (!m_singleBestOutputCollector)
- return;
- std::ostringstream out;
- IOWrapper::FixPrecision(out);
- if (hypo != NULL) {
- VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
- VERBOSE(3,"Best path: ");
- Backtrack(hypo);
- VERBOSE(3,"0" << std::endl);
-
- if (StaticData::Instance().GetOutputHypoScore()) {
- out << hypo->GetTotalScore() << " ";
- }
-
- if (StaticData::Instance().IsPathRecoveryEnabled()) {
- out << "||| ";
- }
- Phrase outPhrase(ARRAY_SIZE_INCR);
- hypo->GetOutputPhrase(outPhrase);
-
- // delete 1st & last
- UTIL_THROW_IF2(outPhrase.GetSize() < 2,
- "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
-
- outPhrase.RemoveWord(0);
- outPhrase.RemoveWord(outPhrase.GetSize() - 1);
-
- const std::vector<FactorType> outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
- string output = outPhrase.GetStringRep(outputFactorOrder);
- out << output << endl;
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
-
- if (StaticData::Instance().GetOutputHypoScore()) {
- out << "0 ";
- }
-
- out << endl;
- }
- m_singleBestOutputCollector->Write(translationId, out.str());
+ m_detailedTranslationCollector->Write(translationId, out.str());
}
-void IOWrapper::OutputBestHypo(search::Applied applied, long translationId)
+void IOWrapper::OutputUnknowns(const std::vector<Moses::Phrase*> &unknowns,
+ long translationId)
{
- if (!m_singleBestOutputCollector) return;
std::ostringstream out;
- IOWrapper::FixPrecision(out);
- if (StaticData::Instance().GetOutputHypoScore()) {
- out << applied.GetScore() << ' ';
- }
- Phrase outPhrase;
- Incremental::ToPhrase(applied, outPhrase);
- // delete 1st & last
- UTIL_THROW_IF2(outPhrase.GetSize() < 2,
- "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
- outPhrase.RemoveWord(0);
- outPhrase.RemoveWord(outPhrase.GetSize() - 1);
- out << outPhrase.GetStringRep(StaticData::Instance().GetOutputFactorOrder());
- out << '\n';
- m_singleBestOutputCollector->Write(translationId, out.str());
-
- VERBOSE(1,"BEST TRANSLATION: " << outPhrase << "[total=" << applied.GetScore() << "]" << endl);
-}
-
-void IOWrapper::OutputBestNone(long translationId)
-{
- if (!m_singleBestOutputCollector) return;
- if (StaticData::Instance().GetOutputHypoScore()) {
- m_singleBestOutputCollector->Write(translationId, "0 \n");
- } else {
- m_singleBestOutputCollector->Write(translationId, "\n");
- }
-}
-
-void IOWrapper::OutputAllFeatureScores(const ScoreComponentCollection &features, std::ostream &out)
-{
- std::string lastName = "";
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for( size_t i=0; i<sff.size(); i++ ) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
- && ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- for( size_t i=0; i<slf.size(); i++ ) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
-} // namespace
-
-void IOWrapper::OutputFeatureScores( std::ostream& out, const ScoreComponentCollection &features, const FeatureFunction *ff, std::string &lastName )
-{
- const StaticData &staticData = StaticData::Instance();
- bool labeledOutput = staticData.IsLabeledNBestList();
-
- // regular features (not sparse)
- if (ff->GetNumScoreComponents() != 0) {
- if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
- lastName = ff->GetScoreProducerDescription();
- out << " " << lastName << "=";
- }
- vector<float> scores = features.GetScoresForProducer( ff );
- for (size_t j = 0; j<scores.size(); ++j) {
- out << " " << scores[j];
- }
- }
-
- // sparse features
- const FVector scores = features.GetVectorForProducer( ff );
- for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
- out << " " << i->first << "= " << i->second;
+ for (std::size_t i = 0; i < unknowns.size(); ++i) {
+ out << *(unknowns[i]);
}
+ out << std::endl;
+ m_unknownsCollector->Write(translationId, out.str());
}
void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList,
@@ -706,7 +854,7 @@ void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList,
if (m_nBestOutputCollector->OutputIsCout()) {
// Set precision only if we're writing the n-best list to cout. This is to
// preserve existing behaviour, but should probably be done either way.
- IOWrapper::FixPrecision(out);
+ FixPrecision(out);
}
bool includeWordAlignment =
@@ -758,65 +906,6 @@ void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList,
m_nBestOutputCollector->Write(translationId, out.str());
}
-void IOWrapper::OutputNBestList(const std::vector<search::Applied> &nbest, long translationId)
-{
- std::ostringstream out;
- // wtf? copied from the original OutputNBestList
- if (m_nBestOutputCollector->OutputIsCout()) {
- IOWrapper::FixPrecision(out);
- }
- Phrase outputPhrase;
- ScoreComponentCollection features;
- for (std::vector<search::Applied>::const_iterator i = nbest.begin(); i != nbest.end(); ++i) {
- Incremental::PhraseAndFeatures(*i, outputPhrase, features);
- // <s> and </s>
- UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
- "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
-
- outputPhrase.RemoveWord(0);
- outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
- out << translationId << " ||| ";
- OutputSurface(out, outputPhrase, m_outputFactorOrder, false);
- out << " ||| ";
- OutputAllFeatureScores(features, out);
- out << " ||| " << i->GetScore() << '\n';
- }
- out << std::flush;
- assert(m_nBestOutputCollector);
- m_nBestOutputCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
-{
- stream.setf(std::ios::fixed);
- stream.precision(size);
-}
-
-template <class T>
-void ShiftOffsets(vector<T> &offsets, T shift)
-{
- T currPos = shift;
- for (size_t i = 0; i < offsets.size(); ++i) {
- if (offsets[i] == 0) {
- offsets[i] = currPos;
- ++currPos;
- } else {
- currPos += offsets[i];
- }
- }
-}
-
-size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
-{
- size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
- const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
- for (size_t i = 0; i < prevHypos.size(); ++i) {
- size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
- ret -= (childSize - 1);
- }
- return ret;
-}
-
size_t IOWrapper::OutputAlignmentNBest(
Alignments &retAlign,
const Moses::ChartKBestExtractor::Derivation &derivation,
@@ -896,76 +985,608 @@ size_t IOWrapper::OutputAlignmentNBest(
return totalTargetSize;
}
-void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothesis *hypo)
+//////////////////////////////////////////////////////////////////////////
+/***
+ * Print the surface factor only for the given phrase.
+ */
+void IOWrapper::OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
+ char reportSegmentation, bool reportAllFactors)
+{
+ UTIL_THROW_IF2(outputFactorOrder.size() == 0,
+ "Must specific at least 1 output factor");
+ const TargetPhrase& phrase = edge.GetCurrTargetPhrase();
+ bool markUnknown = StaticData::Instance().GetMarkUnknown();
+ if (reportAllFactors == true) {
+ out << phrase;
+ } else {
+ FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
+
+ std::map<size_t, const Factor*> placeholders;
+ if (placeholderFactor != NOT_FOUND) {
+ // creates map of target position -> factor for placeholders
+ placeholders = GetPlaceholders(edge, placeholderFactor);
+ }
+
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+
+ if (placeholders.size()) {
+ // do placeholders
+ std::map<size_t, const Factor*>::const_iterator iter = placeholders.find(pos);
+ if (iter != placeholders.end()) {
+ factor = iter->second;
+ }
+ }
+
+ UTIL_THROW_IF2(factor == NULL,
+ "No factor 0 at position " << pos);
+
+ //preface surface form with UNK if marking unknowns
+ const Word &word = phrase.GetWord(pos);
+ if(markUnknown && word.IsOOV()) {
+ out << "UNK" << *factor;
+ } else {
+ out << *factor;
+ }
+
+ for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
+ UTIL_THROW_IF2(factor == NULL,
+ "No factor " << i << " at position " << pos);
+
+ out << "|" << *factor;
+ }
+ out << " ";
+ }
+ }
+
+ // trace ("report segmentation") option "-t" / "-tt"
+ if (reportSegmentation > 0 && phrase.GetSize() > 0) {
+ const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
+ const int sourceStart = sourceRange.GetStartPos();
+ const int sourceEnd = sourceRange.GetEndPos();
+ out << "|" << sourceStart << "-" << sourceEnd; // enriched "-tt"
+ if (reportSegmentation == 2) {
+ out << ",wa=";
+ const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
+ OutputAlignment(out, ai, 0, 0);
+ out << ",total=";
+ out << edge.GetScore() - edge.GetPrevHypo()->GetScore();
+ out << ",";
+ ScoreComponentCollection scoreBreakdown(edge.GetScoreBreakdown());
+ scoreBreakdown.MinusEquals(edge.GetPrevHypo()->GetScoreBreakdown());
+ OutputAllFeatureScores(scoreBreakdown, out);
+ }
+ out << "| ";
+ }
+}
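
// Illustrative sketch (sample output, not from the patch): with "-t" each
// phrase is followed by its source span, e.g. "the house|0-1 is|2 small|3";
// with "-tt" the span is enriched with word alignments and the phrase's
// score delta, e.g. "the house|0-1,wa=0-0 1-1,total=-2.51,...| ".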
+
+void IOWrapper::OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder,
+ char reportSegmentation, bool reportAllFactors)
+{
+ if (hypo != NULL) {
+ // recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence
+ OutputBestSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
+ OutputSurface(out, *hypo, outputFactorOrder, reportSegmentation, reportAllFactors);
+ }
+}
+
+void IOWrapper::OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
+{
+ typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
+ AlignVec alignments = ai.GetSortedAlignments();
+
+ AlignVec::const_iterator it;
+ for (it = alignments.begin(); it != alignments.end(); ++it) {
+ const std::pair<size_t,size_t> &alignment = **it;
+ out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
+ }
+
+}
+
+void IOWrapper::OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
+{
+ size_t targetOffset = 0;
+
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ const TargetPhrase &tp = edge.GetCurrTargetPhrase();
+ size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
+
+ OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
+
+ targetOffset += tp.GetSize();
+ }
+ // Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
+ // Or fix it somewhere else.
+ out << std::endl;
+}
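
// Worked example (illustrative): edges are walked back-to-front. If the first
// phrase covers source [0,1] with two target words and the next phrase covers
// source [2,3], the second phrase's alignment points are printed with
// sourceOffset=2 and targetOffset=2, so a phrase-internal point 0-0 is
// emitted as the absolute point 2-2.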
+
+void IOWrapper::OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
+{
+ std::vector<const Hypothesis *> edges;
+ const Hypothesis *currentHypo = hypo;
+ while (currentHypo) {
+ edges.push_back(currentHypo);
+ currentHypo = currentHypo->GetPrevHypo();
+ }
+
+ OutputAlignment(out, edges);
+
+}
+
+void IOWrapper::OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;
+ OutputAlignment(out, edges);
- if (hypo) {
- Alignments retAlign;
- OutputAlignment(retAlign, hypo, 0);
+ collector->Write(lineNo,out.str());
+}
- // output alignments
- Alignments::const_iterator iter;
- for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
- const pair<size_t, size_t> &alignPoint = *iter;
- out << alignPoint.first << "-" << alignPoint.second << " ";
+void IOWrapper::OutputAlignment(OutputCollector* collector, size_t lineNo , const Hypothesis *hypo)
+{
+ if (collector) {
+ std::vector<const Hypothesis *> edges;
+ const Hypothesis *currentHypo = hypo;
+ while (currentHypo) {
+ edges.push_back(currentHypo);
+ currentHypo = currentHypo->GetPrevHypo();
}
+
+ OutputAlignment(collector,lineNo, edges);
+ }
+}
+
+void IOWrapper::OutputAlignment(OutputCollector* collector, size_t lineNo , const TrellisPath &path)
+{
+ if (collector) {
+ OutputAlignment(collector,lineNo, path.GetEdges());
+ }
+}
+
+void IOWrapper::OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/, char reportSegmentation, bool reportAllFactors, std::ostream &out)
+{
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
+
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ OutputSurface(out, edge, StaticData::Instance().GetOutputFactorOrder(), reportSegmentation, reportAllFactors);
}
out << endl;
+}
- m_alignmentInfoCollector->Write(translationId, out.str());
+void IOWrapper::Backtrack(const Hypothesis *hypo)
+{
+
+ if (hypo->GetPrevHypo() != NULL) {
+ VERBOSE(3,hypo->GetId() << " <= ");
+ Backtrack(hypo->GetPrevHypo());
+ }
}
-void IOWrapper::OutputUnknowns(const std::vector<Moses::Phrase*> &unknowns,
+void IOWrapper::OutputBestHypo(const std::vector<Word>& mbrBestHypo, long /*translationId*/, char /*reportSegmentation*/, bool /*reportAllFactors*/, ostream& out)
+{
+
+ for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
+ const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
+ UTIL_THROW_IF2(factor == NULL,
+ "No factor 0 at position " << i);
+ if (i>0) out << " " << *factor;
+ else out << *factor;
+ }
+ out << endl;
+}
+
+
+void IOWrapper::OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
+{
+ if (hypo->GetPrevHypo()) {
+ OutputInput(map, hypo->GetPrevHypo());
+ map[hypo->GetCurrSourceWordsRange().GetStartPos()] = &hypo->GetTranslationOption().GetInputPath().GetPhrase();
+ }
+}
+
+void IOWrapper::OutputInput(std::ostream& os, const Hypothesis* hypo)
+{
+ size_t len = hypo->GetInput().GetSize();
+ std::vector<const Phrase*> inp_phrases(len, 0);
+ OutputInput(inp_phrases, hypo);
+ for (size_t i=0; i<len; ++i)
+ if (inp_phrases[i]) os << *inp_phrases[i];
+}
+
+void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, char reportSegmentation, bool reportAllFactors)
+{
+ if (hypo != NULL) {
+ VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
+ VERBOSE(3,"Best path: ");
+ Backtrack(hypo);
+ VERBOSE(3,"0" << std::endl);
+ if (!m_surpressSingleBestOutput) {
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ cout << hypo->GetTotalScore() << " ";
+ }
+
+ if (StaticData::Instance().IsPathRecoveryEnabled()) {
+ OutputInput(cout, hypo);
+ cout << "||| ";
+ }
+ OutputBestSurface(cout, hypo, m_outputFactorOrder, reportSegmentation, reportAllFactors);
+ cout << endl;
+ }
+ } else {
+ VERBOSE(1, "NO BEST TRANSLATION" << endl);
+ if (!m_surpressSingleBestOutput) {
+ cout << endl;
+ }
+ }
+}
+
+bool IOWrapper::ReadInput(InputTypeEnum inputType, InputType*& source)
+{
+ delete source;
+ switch(inputType) {
+ case SentenceInput:
+ source = GetInput(new Sentence);
+ break;
+ case ConfusionNetworkInput:
+ source = GetInput(new ConfusionNet);
+ break;
+ case WordLatticeInput:
+ source = GetInput(new WordLattice);
+ break;
+ case TreeInputType:
+ source = GetInput(new TreeInput);
+ break;
+ default:
+ TRACE_ERR("Unknown input type: " << inputType << "\n");
+ }
+ return (source ? true : false);
+}
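
// A sketch of how a driver loop might call this (hedged; the actual main
// loop is outside this diff):
//   InputType *source = NULL;
//   while (ioWrapper->ReadInput(staticData.GetInputType(), source)) {
//     /* translate *source */
//   }
// Note that the method deletes the previous input object before reading the
// next one, so the same pointer can be reused across iterations.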
+
+void IOWrapper::OutputNBest(std::ostream& out
+ , const Moses::TrellisPathList &nBestList
+ , const std::vector<Moses::FactorType>& outputFactorOrder
+ , long translationId
+ , char reportSegmentation)
+{
+ const StaticData &staticData = StaticData::Instance();
+ bool reportAllFactors = staticData.GetReportAllFactorsNBest();
+ bool includeSegmentation = staticData.NBestIncludesSegmentation();
+ bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
+
+ TrellisPathList::const_iterator iter;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+ const TrellisPath &path = **iter;
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
+
+ // print the surface factor of the translation
+ out << translationId << " ||| ";
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ OutputSurface(out, edge, outputFactorOrder, reportSegmentation, reportAllFactors);
+ }
+ out << " |||";
+
+ // print scores with feature names
+ OutputAllFeatureScores(path.GetScoreBreakdown(), out );
+
+ // total
+ out << " ||| " << path.GetTotalScore();
+
+ //phrase-to-phrase segmentation
+ if (includeSegmentation) {
+ out << " |||";
+ for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
+ WordsRange targetRange = path.GetTargetWordsRange(edge);
+ out << " " << sourceRange.GetStartPos();
+ if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
+ out << "-" << sourceRange.GetEndPos();
+ }
+ out<< "=" << targetRange.GetStartPos();
+ if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
+ out<< "-" << targetRange.GetEndPos();
+ }
+ }
+ }
+
+ if (includeWordAlignment) {
+ out << " ||| ";
+ for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
+ WordsRange targetRange = path.GetTargetWordsRange(edge);
+ const int sourceOffset = sourceRange.GetStartPos();
+ const int targetOffset = targetRange.GetStartPos();
+ const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
+
+ OutputAlignment(out, ai, sourceOffset, targetOffset);
+
+ }
+ }
+
+ if (StaticData::Instance().IsPathRecoveryEnabled()) {
+ out << " ||| ";
+ OutputInput(out, edges[0]);
+ }
+
+ out << endl;
+ }
+
+ out << std::flush;
+}
+
+void IOWrapper::OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
+ , std::ostream &out)
+{
+ std::string lastName = "";
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for( size_t i=0; i<sff.size(); i++ ) {
+ const StatefulFeatureFunction *ff = sff[i];
+ if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
+ && ff->IsTuneable()) {
+ OutputFeatureScores( out, features, ff, lastName );
+ }
+ }
+ const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ for( size_t i=0; i<slf.size(); i++ ) {
+ const StatelessFeatureFunction *ff = slf[i];
+ if (ff->IsTuneable()) {
+ OutputFeatureScores( out, features, ff, lastName );
+ }
+ }
+}
+
+void IOWrapper::OutputFeatureScores( std::ostream& out
+ , const ScoreComponentCollection &features
+ , const FeatureFunction *ff
+ , std::string &lastName )
+{
+ const StaticData &staticData = StaticData::Instance();
+ bool labeledOutput = staticData.IsLabeledNBestList();
+
+ // regular features (not sparse)
+ if (ff->GetNumScoreComponents() != 0) {
+ if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
+ lastName = ff->GetScoreProducerDescription();
+ out << " " << lastName << "=";
+ }
+ vector<float> scores = features.GetScoresForProducer( ff );
+ for (size_t j = 0; j<scores.size(); ++j) {
+ out << " " << scores[j];
+ }
+ }
+
+ // sparse features
+ const FVector scores = features.GetVectorForProducer( ff );
+ for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
+ out << " " << i->first << "= " << i->second;
+ }
+}
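
// Illustrative fragment (made-up names and numbers): with labeled output, a
// dense producer prints its description once followed by its scores, while
// each sparse feature prints its own name, so one n-best score field might
// read:
//   ... ||| LM= -104.3 TM= -7.2 -9.1 mySparseFeat= 0.5 ||| -12.7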
+
+void IOWrapper::OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId)
+{
+ for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
+ out << translationId;
+ out << " |||";
+ const vector<Word> mbrHypo = si->GetWords();
+ for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
+ const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
+ if (i>0) out << " " << *factor;
+ else out << *factor;
+ }
+ out << " |||";
+ out << " map: " << si->GetMapScore();
+ out << " w: " << mbrHypo.size();
+ const vector<float>& ngramScores = si->GetNgramScores();
+ for (size_t i = 0; i < ngramScores.size(); ++i) {
+ out << " " << ngramScores[i];
+ }
+ out << " ||| " << si->GetScore();
+
+ out << endl;
+ }
+}
+
+
+void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solutions,long translationId)
+{
+ OutputLatticeMBRNBest(*m_nBestStream, solutions,translationId);
+}
+
+IOWrapper *IOWrapper::GetIOWrapper(const StaticData &staticData)
+{
+ IOWrapper *ioWrapper;
+ const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
+ ,&outputFactorOrder = staticData.GetOutputFactorOrder();
+ FactorMask inputFactorUsed(inputFactorOrder);
+
+ // io
+ string inputPath;
+ if (staticData.GetParam("input-file").size() == 1) {
+ VERBOSE(2,"IO from File" << endl);
+ inputPath = staticData.GetParam("input-file")[0];
+ }
+ ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
+ , staticData.GetNBestSize()
+ , staticData.GetNBestFilePath()
+ , inputPath);
+
+ IFVERBOSE(1)
+ PrintUserTime("Created input-output object");
+
+ return ioWrapper;
+}
+
+
+////////////////////////////
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/S2T/DerivationWriter.h"
+
+void IOWrapper::OutputDetailedTranslationReport(const Syntax::SHyperedge *best,
+ long translationId)
+{
+ if (best == NULL) {
+ return;
+ }
+ std::ostringstream out;
+ Syntax::S2T::DerivationWriter::Write(*best, translationId, out);
+ UTIL_THROW_IF2(m_detailedTranslationCollector == NULL,
+ "No ouput file for detailed reports specified");
+ m_detailedTranslationCollector->Write(translationId, out.str());
+}
+
+void IOWrapper::OutputBestHypo(const Syntax::SHyperedge *best,
long translationId)
{
+ if (!m_singleBestOutputCollector) {
+ return;
+ }
std::ostringstream out;
- for (std::size_t i = 0; i < unknowns.size(); ++i) {
- out << *(unknowns[i]);
+ IOWrapper::FixPrecision(out);
+ if (best == NULL) {
+ VERBOSE(1, "NO BEST TRANSLATION" << std::endl);
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << "0 ";
+ }
+ } else {
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << best->score << " ";
+ }
+ Phrase yield = Syntax::GetOneBestTargetYield(*best);
+ // delete 1st & last
+ UTIL_THROW_IF2(yield.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+ yield.RemoveWord(0);
+ yield.RemoveWord(yield.GetSize()-1);
+ out << yield.GetStringRep(StaticData::Instance().GetOutputFactorOrder());
+ out << '\n';
}
- out << std::endl;
- m_unknownsCollector->Write(translationId, out.str());
+ m_singleBestOutputCollector->Write(translationId, out.str());
}
-size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget)
+void IOWrapper::OutputNBestList(
+ const Syntax::KBestExtractor::KBestVec &nBestList, long translationId)
+{
+ std::ostringstream out;
+
+ if (m_nBestOutputCollector->OutputIsCout()) {
+ // Set precision only if we're writing the n-best list to cout. This is to
+ // preserve existing behaviour, but should probably be done either way.
+ IOWrapper::FixPrecision(out);
+ }
+
+ bool includeWordAlignment =
+ StaticData::Instance().PrintAlignmentInfoInNbest();
+
+ bool PrintNBestTrees = StaticData::Instance().PrintNBestTrees();
+
+ for (Syntax::KBestExtractor::KBestVec::const_iterator p = nBestList.begin();
+ p != nBestList.end(); ++p) {
+ const Syntax::KBestExtractor::Derivation &derivation = **p;
+
+ // get the derivation's target-side yield
+ Phrase outputPhrase = Syntax::KBestExtractor::GetOutputPhrase(derivation);
+
+ // delete <s> and </s>
+ UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+ outputPhrase.RemoveWord(0);
+ outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
+
+ // print the translation ID, surface factors, and scores
+ out << translationId << " ||| ";
+ OutputSurface(out, outputPhrase, m_outputFactorOrder, false);
+ out << " ||| ";
+ OutputAllFeatureScores(derivation.scoreBreakdown, out);
+ out << " ||| " << derivation.score;
+
+ // optionally, print word alignments
+ if (includeWordAlignment) {
+ out << " ||| ";
+ Alignments align;
+ OutputAlignmentNBest(align, derivation, 0);
+ for (Alignments::const_iterator q = align.begin(); q != align.end();
+ ++q) {
+ out << q->first << "-" << q->second << " ";
+ }
+ }
+
+ // optionally, print tree
+ if (PrintNBestTrees) {
+ TreePointer tree = Syntax::KBestExtractor::GetOutputTree(derivation);
+ out << " ||| " << tree->GetString();
+ }
+
+ out << std::endl;
+ }
+
+ assert(m_nBestOutputCollector);
+ m_nBestOutputCollector->Write(translationId, out.str());
+}
+
+size_t IOWrapper::CalcSourceSize(const Syntax::KBestExtractor::Derivation &d) const
+{
+ using namespace Moses::Syntax;
+
+ const Syntax::SHyperedge &shyperedge = d.edge->shyperedge;
+ size_t ret = shyperedge.head->pvertex->span.GetNumWordsCovered();
+ for (size_t i = 0; i < shyperedge.tail.size(); ++i) {
+ size_t childSize = shyperedge.tail[i]->pvertex->span.GetNumWordsCovered();
+ ret -= (childSize - 1);
+ }
+ return ret;
+}
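
// Worked example (illustrative): if the hyperedge's head spans 6 source words
// and its two tail vertices span 2 and 3 words, the rule itself contributes
// 6 - (2-1) - (3-1) = 3 source terminals, which is exactly the subtraction
// performed above.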
+
+size_t IOWrapper::OutputAlignmentNBest(
+ Alignments &retAlign,
+ const Syntax::KBestExtractor::Derivation &derivation,
+ size_t startTarget)
{
+ const Syntax::SHyperedge &shyperedge = derivation.edge->shyperedge;
+
size_t totalTargetSize = 0;
- size_t startSource = hypo->GetCurrSourceRange().GetStartPos();
+ size_t startSource = shyperedge.head->pvertex->span.GetStartPos();
- const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
+ const TargetPhrase &tp = *(shyperedge.translation);
- size_t thisSourceSize = CalcSourceSize(hypo);
+ size_t thisSourceSize = CalcSourceSize(derivation);
// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
- const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
-
- const AlignmentInfo &aiNonTerm = hypo->GetCurrTargetPhrase().GetAlignNonTerm();
+ const AlignmentInfo &aiNonTerm = shyperedge.translation->GetAlignNonTerm();
vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
- UTIL_THROW_IF2(sourceInd2pos.size() != prevHypos.size(), "Error");
+ UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
+ "Error");
size_t targetInd = 0;
for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
if (tp.GetWord(targetPos).IsNonTerminal()) {
- UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
+ UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
size_t sourceInd = targetPos2SourceInd[targetPos];
size_t sourcePos = sourceInd2pos[sourceInd];
- const ChartHypothesis *prevHypo = prevHypos[sourceInd];
+ const Moses::Syntax::KBestExtractor::Derivation &subderivation =
+ *derivation.subderivations[sourceInd];
// calc source size
- size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
+ size_t sourceSize =
+ subderivation.edge->head->svertex.pvertex->span.GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
- size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
+ size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
+ currStartTarget);
targetOffsets[targetPos] = targetSize;
totalTargetSize += targetSize;
@@ -981,7 +1602,7 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
- const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
+ const AlignmentInfo &aiTerm = shyperedge.translation->GetAlignTerm();
// add to output arg, offsetting by source & target
AlignmentInfo::const_iterator iter;
@@ -995,26 +1616,22 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
UTIL_THROW_IF2(!ret.second, "Error");
-
}
return totalTargetSize;
}
-void IOWrapper::OutputAlignment(vector< set<size_t> > &retAlignmentsS2T, const AlignmentInfo &ai)
+void IOWrapper::OutputUnknowns(const std::set<Moses::Word> &unknowns,
+ long translationId)
{
- typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
- AlignVec alignments = ai.GetSortedAlignments();
-
- AlignVec::const_iterator it;
- for (it = alignments.begin(); it != alignments.end(); ++it) {
- const std::pair<size_t,size_t> &alignPoint = **it;
-
- UTIL_THROW_IF2(alignPoint.first >= retAlignmentsS2T.size(), "Error");
- pair<set<size_t>::iterator, bool> ret = retAlignmentsS2T[alignPoint.first].insert(alignPoint.second);
- UTIL_THROW_IF2(!ret.second, "Error");
+ std::ostringstream out;
+ for (std::set<Moses::Word>::const_iterator p = unknowns.begin();
+ p != unknowns.end(); ++p) {
+ out << *p;
}
+ out << std::endl;
+ m_unknownsCollector->Write(translationId, out.str());
}
-}
+} // namespace
diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h
new file mode 100644
index 000000000..4253871b3
--- /dev/null
+++ b/moses/IOWrapper.h
@@ -0,0 +1,279 @@
+// $Id$
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the University of Edinburgh nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+// Example of how to use the moses library.
+
+#pragma once
+
+#include <cassert>
+#include <fstream>
+#include <ostream>
+#include <vector>
+
+#include "moses/TypeDef.h"
+#include "moses/Sentence.h"
+#include "moses/FactorTypeSet.h"
+#include "moses/FactorCollection.h"
+#include "moses/Hypothesis.h"
+#include "moses/OutputCollector.h"
+#include "moses/TrellisPathList.h"
+#include "moses/InputFileStream.h"
+#include "moses/InputType.h"
+#include "moses/WordLattice.h"
+#include "moses/LatticeMBR.h"
+#include "moses/ChartKBestExtractor.h"
+#include "moses/Syntax/KBestExtractor.h"
+
+#include "search/applied.hh"
+
+namespace Moses
+{
+class ScoreComponentCollection;
+class Hypothesis;
+class ChartHypothesis;
+class Factor;
+
+namespace Syntax
+{
+struct SHyperedge;
+}
+
+/** Helper class that holds misc variables to write data out to command line.
+ */
+class IOWrapper
+{
+protected:
+
+ const std::vector<Moses::FactorType> &m_inputFactorOrder;
+ const std::vector<Moses::FactorType> &m_outputFactorOrder;
+ const Moses::FactorMask &m_inputFactorUsed;
+ std::string m_inputFilePath;
+ Moses::InputFileStream *m_inputFile;
+ std::istream *m_inputStream;
+ std::ostream *m_nBestStream;
+ std::ostream *m_outputWordGraphStream;
+ std::ostream *m_detailedTranslationReportingStream;
+ std::ofstream *m_alignmentInfoStream;
+ std::ostream *m_unknownsStream;
+ std::ostream *m_outputSearchGraphStream;
+ std::ofstream *m_latticeSamplesStream;
+ std::ostream *m_detailedTreeFragmentsTranslationReportingStream;
+
+ bool m_surpressSingleBestOutput;
+
+ Moses::OutputCollector *m_singleBestOutputCollector;
+ Moses::OutputCollector *m_nBestOutputCollector;
+ Moses::OutputCollector *m_unknownsCollector;
+ Moses::OutputCollector *m_alignmentInfoCollector;
+ Moses::OutputCollector *m_searchGraphOutputCollector;
+ Moses::OutputCollector *m_detailedTranslationCollector;
+ Moses::OutputCollector *m_wordGraphCollector;
+ Moses::OutputCollector *m_latticeSamplesCollector;
+ Moses::OutputCollector *m_detailTreeFragmentsOutputCollector;
+
+ // CHART
+ typedef std::vector<std::pair<Moses::Word, Moses::WordsRange> > ApplicationContext;
+ typedef std::set< std::pair<size_t, size_t> > Alignments;
+
+ void Backtrack(const ChartHypothesis *hypo);
+ void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
+ void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
+ void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
+ void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
+
+ void ReconstructApplicationContext(const Moses::ChartHypothesis &hypo,
+ const Moses::Sentence &sentence,
+ ApplicationContext &context);
+ void ReconstructApplicationContext(const search::Applied *applied,
+ const Moses::Sentence &sentence,
+ ApplicationContext &context);
+ void WriteApplicationContext(std::ostream &out,
+ const ApplicationContext &context);
+ void OutputTreeFragmentsTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const Moses::ChartHypothesis *hypo,
+ const Moses::Sentence &sentence,
+ long translationId);
+ void OutputTreeFragmentsTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const search::Applied *applied,
+ const Moses::Sentence &sentence,
+ long translationId);
+
+ void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors);
+ void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
+ char reportSegmentation, bool reportAllFactors);
+
+ size_t OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget);
+ size_t OutputAlignmentNBest(Alignments &retAlign,
+ const Moses::ChartKBestExtractor::Derivation &derivation,
+ size_t startTarget);
+ std::size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::Syntax::KBestExtractor::Derivation &derivation, std::size_t startTarget);
+
+ size_t CalcSourceSize(const Moses::ChartHypothesis *hypo);
+ size_t CalcSourceSize(const Syntax::KBestExtractor::Derivation &d) const;
+
+ template <class T>
+ void ShiftOffsets(std::vector<T> &offsets, T shift)
+ {
+ T currPos = shift;
+ for (size_t i = 0; i < offsets.size(); ++i) {
+ if (offsets[i] == 0) {
+ offsets[i] = currPos;
+ ++currPos;
+ } else {
+ currPos += offsets[i];
+ }
+ }
+ }
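
// Illustrative trace: offsets arrives holding subtree sizes at nonterminal
// positions and 0 at terminal positions. With shift=10 and offsets={0,3,0},
// the loop yields {10,3,14}: terminal slots receive absolute positions,
// while each nonterminal slot keeps its size and advances the running
// position by that amount.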
+
+public:
+ static IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
+ static void FixPrecision(std::ostream &, size_t size=3);
+
+ IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
+ , const std::vector<Moses::FactorType> &outputFactorOrder
+ , const Moses::FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const std::string &nBestFilePath
+ , const std::string &inputFilePath = "");
+ ~IOWrapper();
+
+ Moses::InputType* GetInput(Moses::InputType *inputType);
+ bool ReadInput(Moses::InputTypeEnum inputType, Moses::InputType*& source);
+
+ void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, char reportSegmentation, bool reportAllFactors);
+ void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
+ void Backtrack(const Moses::Hypothesis *hypo);
+
+ Moses::OutputCollector *GetSingleBestOutputCollector() {
+ return m_singleBestOutputCollector;
+ }
+
+ Moses::OutputCollector *GetNBestOutputCollector() {
+ return m_nBestOutputCollector;
+ }
+
+ Moses::OutputCollector *GetUnknownsCollector() {
+ return m_unknownsCollector;
+ }
+
+ Moses::OutputCollector *GetAlignmentInfoCollector() {
+ return m_alignmentInfoCollector;
+ }
+
+ Moses::OutputCollector *GetSearchGraphOutputCollector() {
+ return m_searchGraphOutputCollector;
+ }
+
+ Moses::OutputCollector *GetDetailedTranslationCollector() {
+ return m_detailedTranslationCollector;
+ }
+
+ Moses::OutputCollector *GetWordGraphCollector() {
+ return m_wordGraphCollector;
+ }
+
+ Moses::OutputCollector *GetLatticeSamplesCollector() {
+ return m_latticeSamplesCollector;
+ }
+
+ // CHART
+ void OutputBestHypo(const Moses::ChartHypothesis *hypo, long translationId);
+ void OutputBestHypo(search::Applied applied, long translationId);
+ void OutputBestHypo(const Moses::Syntax::SHyperedge *, long translationId);
+
+ void OutputBestNone(long translationId);
+
+ void OutputNBestList(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, long translationId);
+ void OutputNBestList(const std::vector<search::Applied> &nbest, long translationId);
+ void OutputNBestList(const Moses::Syntax::KBestExtractor::KBestVec &nBestList, long translationId);
+
+ void OutputDetailedTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
+ void OutputDetailedTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
+ void OutputDetailedTranslationReport(const Moses::Syntax::SHyperedge *, long translationId);
+
+ void OutputDetailedAllTranslationReport(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, const Moses::ChartManager &manager, const Moses::Sentence &sentence, long translationId);
+
+ void OutputAlignment(size_t translationId , const Moses::ChartHypothesis *hypo);
+ void OutputUnknowns(const std::vector<Moses::Phrase*> &, long);
+ void OutputUnknowns(const std::set<Moses::Word> &, long);
+
+ void OutputDetailedTreeFragmentsTranslationReport(const Moses::ChartHypothesis *hypo,
+ const Moses::Sentence &sentence,
+ long translationId);
+ void OutputDetailedTreeFragmentsTranslationReport(const search::Applied *applied,
+ const Moses::Sentence &sentence,
+ long translationId);
+
+ // phrase-based
+ void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, char reportSegmentation, bool reportAllFactors);
+ void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
+ void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
+ char reportSegmentation, bool reportAllFactors, std::ostream& out);
+ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,char reportSegmentation, bool reportAllFactors, std::ostream &out);
+ void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
+ void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo);
+
+ void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
+ void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
+ void OutputAlignment(OutputCollector* collector, size_t lineNo , const std::vector<const Hypothesis *> &edges);
+
+ static void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
+ static void OutputAlignment(std::ostream &out, const std::vector<const Hypothesis *> &edges);
+ static void OutputAlignment(std::ostream &out, const Moses::AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset);
+
+ void OutputNBest(std::ostream& out
+ , const Moses::TrellisPathList &nBestList
+ , const std::vector<Moses::FactorType>& outputFactorOrder
+ , long translationId
+ , char reportSegmentation);
+
+ static void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
+ , std::ostream &out);
+ static void OutputFeatureScores( std::ostream& out
+ , const Moses::ScoreComponentCollection &features
+ , const Moses::FeatureFunction *ff
+ , std::string &lastName );
+
+ // creates a map of TARGET positions which should be replaced by word using placeholder
+ std::map<size_t, const Moses::Factor*> GetPlaceholders(const Moses::Hypothesis &hypo, Moses::FactorType placeholderFactor);
+
+ // post editing
+ std::ifstream *spe_src, *spe_trg, *spe_aln;
+
+};
+
+
+
+}
+
diff --git a/moses/Jamfile b/moses/Jamfile
index d44ac5277..3be041e60 100644
--- a/moses/Jamfile
+++ b/moses/Jamfile
@@ -10,14 +10,14 @@ if $(with-dlib) {
dlib = ;
}
-with-lbllm = [ option.get "with-lbllm" ] ;
-if $(with-lbllm) {
- lbllm2 = <cxxflags>-std=c++0x <define>LM_LBL <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
+with-oxlm = [ option.get "with-oxlm" ] ;
+if $(with-oxlm) {
+ oxlm = <cxxflags>-std=c++0x <define>LM_OXLM <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
} else {
- lbllm2 = ;
+ oxlm = ;
}
-alias headers : ../util//kenutil : : : $(max-factors) $(dlib) $(lbllm2) ;
+alias headers : ../util//kenutil : : : $(max-factors) $(dlib) $(oxlm) ;
alias ThreadPool : ThreadPool.cpp ;
alias Util : Util.cpp Timer.cpp ;
@@ -63,6 +63,11 @@ if [ option.get "with-mm" : no : yes ] = yes
lib moses :
[ glob
*.cpp
+ Syntax/*.cpp
+ Syntax/S2T/*.cpp
+ Syntax/S2T/Parsers/*.cpp
+ Syntax/S2T/Parsers/RecursiveCYKPlusParser/*.cpp
+ Syntax/S2T/Parsers/Scope3Parser/*.cpp
TranslationModel/*.cpp
TranslationModel/fuzzy-match/*.cpp
TranslationModel/DynSAInclude/*.cpp
@@ -71,6 +76,7 @@ lib moses :
TranslationModel/CYKPlusParser/*.cpp
../phrase-extract/extract-ghkm/PhraseOrientation.cpp
FF/*.cpp
+ FF/bilingual-lm/*.cpp
FF/OSM-Feature/*.cpp
FF/LexicalReordering/*.cpp
PP/*.cpp
diff --git a/moses/LM/BilingualLM.cpp b/moses/LM/BilingualLM.cpp
new file mode 100644
index 000000000..7b05f88f8
--- /dev/null
+++ b/moses/LM/BilingualLM.cpp
@@ -0,0 +1,465 @@
+#include <vector>
+#include "BilingualLM.h"
+#include "moses/ScoreComponentCollection.h"
+
+using namespace std;
+
+namespace Moses {
+
+int BilingualLMState::Compare(const FFState& other) const
+{
+ const BilingualLMState &otherState = static_cast<const BilingualLMState&>(other);
+
+ if (m_hash == otherState.m_hash)
+ return 0;
+ return (m_hash < otherState.m_hash) ? -1 : +1;
+}
+
+////////////////////////////////////////////////////////////////
+BilingualLM::BilingualLM(const std::string &line)
+ : StatefulFeatureFunction(1, line),
+ word_factortype(0) {
+ FactorCollection& factorFactory = FactorCollection::Instance(); //Factor Factory to use for BOS_ and EOS_
+ BOS_factor = factorFactory.AddFactor(BOS_);
+ BOS_word.SetFactor(0, BOS_factor);
+ EOS_factor = factorFactory.AddFactor(EOS_);
+ EOS_word.SetFactor(0, EOS_factor);
+
+}
+
+void BilingualLM::Load(){
+ ReadParameters();
+ loadModel();
+}
+
+//Populates words with amount word ids from the target phrases of previous
+//hypotheses: words[0] is the last word of the previous hypothesis, words[1]
+//the second-to-last, and so on.
+void BilingualLM::requestPrevTargetNgrams(
+ const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const {
+ const Hypothesis * prev_hyp = cur_hypo.GetPrevHypo();
+ int found = 0;
+
+ while (prev_hyp && found != amount) {
+ const TargetPhrase& currTargetPhrase = prev_hyp->GetCurrTargetPhrase();
+ for (int i = currTargetPhrase.GetSize() - 1; i> -1; i--){
+ if (found != amount){
+ const Word& word = currTargetPhrase.GetWord(i);
+ words[found] = getNeuralLMId(word, false);
+ found++;
+ } else {
+ return; //We have gotten everything needed
+ }
+ }
+
+ prev_hyp = prev_hyp->GetPrevHypo();
+ }
+
+ int neuralLM_wordID = getNeuralLMId(BOS_word, false);
+ for (int i = found; i < amount; i++){
+ words[i] = neuralLM_wordID;
+ }
+}
+
+//Populates the words vector with target_ngrams words of context plus the
+//current word we are looking at (in effect target_ngrams + 1 ids).
+void BilingualLM::getTargetWords(
+ const Hypothesis &cur_hypo,
+ const TargetPhrase &targetPhrase,
+ int current_word_index,
+ std::vector<int> &words) const {
+ //Check if we need to look at previous target phrases
+ int additional_needed = current_word_index - target_ngrams;
+ if (additional_needed < 0) {
+ additional_needed = -additional_needed;
+ std::vector<int> prev_words(additional_needed);
+ requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
+ for (int i = additional_needed - 1; i >= 0; i--) {
+ words.push_back(prev_words[i]);
+ }
+ }
+
+ if (words.size() > 0) {
+ //We have added some words from previous phrases
+ //Just add until we reach current_word_index
+ for (int i = 0; i <= current_word_index; i++) {
+ const Word& word = targetPhrase.GetWord(i);
+ words.push_back(getNeuralLMId(word, false));
+ }
+ } else {
+ //We haven't added any words, proceed as before
+ for (int i = current_word_index - target_ngrams; i <= current_word_index; i++){
+ const Word& word = targetPhrase.GetWord(i);
+ words.push_back(getNeuralLMId(word, false));
+ }
+ }
+}
+
+//The next two methods return source words in the way NeuralLM expects them.
+
+size_t BilingualLM::selectMiddleAlignment(
+ const set<size_t>& alignment_links) const {
+ assert(alignment_links.size() > 0);
+
+ set<size_t>::iterator it = alignment_links.begin();
+ for (int i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
+ ++it;
+ }
+
+ return *it;
+}
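
// A minimal standalone sketch (MiddleLink is an assumed name, not Moses API)
// of the middle-link rule above: advance (size-1)/2 steps from the beginning
// of the ordered set, so {2,5,9} yields 5 and {2,5} yields 2 (ties prefer
// the left link).
#include <cstddef>
#include <iterator>
#include <set>
static std::size_t MiddleLink(const std::set<std::size_t> &links) {
  std::set<std::size_t>::const_iterator it = links.begin();
  std::advance(it, (links.size() - 1) / 2);  // step one by one; set iterators are bidirectional
  return *it;
}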
+
+void BilingualLM::getSourceWords(
+ const TargetPhrase &targetPhrase,
+ int targetWordIdx,
+ const Sentence &source_sent,
+ const WordsRange &sourceWordRange,
+ std::vector<int> &words) const {
+ //Get source context
+
+ //Get alignment for the word we require
+ const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();
+
+ // We are getting word alignment for targetPhrase.GetWord(i + target_ngrams -1) according to the paper.
+ // Find the closest target word with alignment links.
+ std::set<size_t> last_word_al;
+ for (int j = 0; j < targetPhrase.GetSize(); j++) {
+ // Find the nearest aligned word with preference for right.
+ if ((targetWordIdx + j) < targetPhrase.GetSize()){
+ last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx + j);
+ if (!last_word_al.empty()) {
+ break;
+ }
+ }
+
+ // We couldn't find word on the right, try to the left.
+ if ((targetWordIdx - j) >= 0) {
+ last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx - j);
+ if (!last_word_al.empty()) {
+ break;
+ }
+ }
+ }
+
+  //Assume we have found some alignment here. If the routine above failed, it means
+  //that none of the words in the target phrase aligned to any word in the source phrase.
+
+ // Now we get the source words. First select middle alignment.
+ size_t source_center_index = selectMiddleAlignment(last_word_al);
+ // We have found the alignment. Now determine how much to shift by to get the actual source word index.
+ size_t phrase_start_pos = sourceWordRange.GetStartPos();
+ // Account for how far the current word is from the start of the phrase.
+ size_t source_word_mid_idx = phrase_start_pos + source_center_index;
+
+ appendSourceWordsToVector(source_sent, words, source_word_mid_idx);
+}
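
// Worked example (illustrative): for targetWordIdx=2 in a five-word target
// phrase where only positions 0 and 4 carry alignment links, the loop above
// probes positions in the order 2, 2, 3, 1, 4 (right before left at each
// distance) and stops at the first non-empty alignment set, here position 4.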
+
+size_t BilingualLM::getState(const Hypothesis& cur_hypo) const {
+ const TargetPhrase &targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ size_t hashCode = 0;
+
+ // Check if we need to look at previous target phrases
+ int additional_needed = targetPhrase.GetSize() - target_ngrams;
+ if (additional_needed < 0) {
+ additional_needed = -additional_needed;
+ std::vector<int> prev_words(additional_needed);
+ requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
+ for (int i = additional_needed - 1; i >= 0; i--) {
+ boost::hash_combine(hashCode, prev_words[i]);
+ }
+
+ // Get the rest of the phrases needed
+ for (int i = 0; i < targetPhrase.GetSize(); i++) {
+ const Word& word = targetPhrase.GetWord(i);
+ int neuralLM_wordID = getNeuralLMId(word, false);
+ boost::hash_combine(hashCode, neuralLM_wordID);
+ }
+ } else {
+ // We just need the last target_ngrams from the current target phrase.
+ for (int i = targetPhrase.GetSize() - target_ngrams; i < targetPhrase.GetSize(); i++) {
+ const Word& word = targetPhrase.GetWord(i);
+ int neuralLM_wordID = getNeuralLMId(word, false);
+
+ boost::hash_combine(hashCode, neuralLM_wordID);
+ }
+ }
+
+ return hashCode;
+}
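
// A hedged sketch of the recombination state above (HashContext is an
// assumed name): the state is just the combined boost hash of the last
// target_ngrams word ids, so hypotheses ending in the same n-gram compare
// equal in BilingualLMState::Compare.
#include <cstddef>
#include <vector>
#include <boost/functional/hash.hpp>
static std::size_t HashContext(const std::vector<int> &lastWords) {
  std::size_t h = 0;
  for (std::size_t i = 0; i < lastWords.size(); ++i)
    boost::hash_combine(h, lastWords[i]);  // same combiner the feature uses
  return h;
}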
+
+void BilingualLM::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const {}
+
+void BilingualLM::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+
+}
+
+
+FFState* BilingualLM::EvaluateWhenApplied(
+ const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const {
+ Manager& manager = cur_hypo.GetManager();
+ const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());
+
+ // Init vectors.
+ std::vector<int> source_words;
+ source_words.reserve(source_ngrams);
+ std::vector<int> target_words;
+ target_words.reserve(target_ngrams);
+
+ float value = 0;
+ const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
+ const WordsRange& sourceWordRange = cur_hypo.GetCurrSourceWordsRange(); //Source words range to calculate offsets
+
+ // For each word in the current target phrase get its LM score.
+ for (int i = 0; i < currTargetPhrase.GetSize(); i++){
+ getSourceWords(
+ currTargetPhrase, i, source_sent, sourceWordRange, source_words);
+ getTargetWords(cur_hypo, currTargetPhrase, i, target_words);
+ value += Score(source_words, target_words);
+
+ // Clear the vectors.
+ source_words.clear();
+ target_words.clear();
+ }
+
+ size_t new_state = getState(cur_hypo);
+ accumulator->PlusEquals(this, value);
+
+ return new BilingualLMState(new_state);
+}
+
+void BilingualLM::getAllTargetIdsChart(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& wordIds) const {
+ const TargetPhrase targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ int next_nonterminal_index = 0;
+
+ for (int i = 0; i < targetPhrase.GetSize(); i++){
+    if (targetPhrase.GetWord(i).IsNonTerminal()){ //Nonterminal: get word ids from the previous state
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(next_nonterminal_index);
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int> prevWordIDs = prev_state->GetWordIdsVector();
+ for (std::vector<int>::const_iterator it = prevWordIDs.begin(); it!= prevWordIDs.end(); it++){
+ wordIds.push_back(*it);
+ }
+ next_nonterminal_index++;
+ } else {
+ wordIds.push_back(getNeuralLMId(targetPhrase.GetWord(i), false));
+ }
+ }
+}
+
+void BilingualLM::getAllAlignments(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& word_alignments) const {
+ const TargetPhrase targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ int next_nonterminal_index = 0;
+ int source_phrase_start_pos = cur_hypo.GetCurrSourceRange().GetStartPos();
+ int source_word_mid_idx; //The word alignment
+
+ //Get source sent
+ const ChartManager& manager = cur_hypo.GetManager();
+ const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());
+ const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();
+
+ for (int i = 0; i < targetPhrase.GetSize(); i++){
+ //Sometimes we have to traverse more than one target word because of
+ //unaligned words. This is O(n^2) in the worst case, but usually closer to O(n).
+ if (targetPhrase.GetWord(i).IsNonTerminal()){
+ //If we have a non terminal we can get the alignments from the previous state
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(next_nonterminal_index);
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int> prevWordAls = prev_state->GetWordAlignmentVector();
+ for (std::vector<int>::const_iterator it = prevWordAls.begin(); it!= prevWordAls.end(); it++){
+ word_alignments.push_back(*it);
+ }
+ next_nonterminal_index++;
+ } else {
+ std::set<size_t> word_al; //Keep word alignments
+ bool resolvedIndices = false; //If we are aligning to an existing nonterm we don't need to calculate offsets
+ for (int j = 0; j < targetPhrase.GetSize(); j++){
+ //Try to get alignment from the current word and if it is unaligned,
+ //try from the first word to the right and then to the left
+ if ((i+j) < targetPhrase.GetSize()) {
+ if (targetPhrase.GetWord(i + j).IsNonTerminal()) {
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(next_nonterminal_index);
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int>& word_alignments = prev_state->GetWordAlignmentVector();
+ source_word_mid_idx = word_alignments.front(); // The first word on the right of our word
+ resolvedIndices = true;
+ break;
+ }
+ word_al = alignments.GetAlignmentsForTarget(i + j);
+ if (!word_al.empty()) {
+ break;
+ }
+ }
+
+ if ((i - j) >= 0) {
+ if (targetPhrase.GetWord(i - j).IsNonTerminal()) {
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(next_nonterminal_index - 1); //We need to look at the nonterm on the left.
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int>& word_alignments = prev_state->GetWordAlignmentVector();
+ source_word_mid_idx = word_alignments.back(); // The first word on the left of our word
+ resolvedIndices = true;
+ break;
+ }
+
+ word_al = alignments.GetAlignmentsForTarget(i - j);
+ if (!word_al.empty()) {
+ break;
+ }
+ }
+ }
+
+ if (!resolvedIndices) {
+ size_t source_center_index = selectMiddleAlignment(word_al);
+ // We have found the alignment. Now determine how much to shift by to get the actual source word index.
+ int nonterm_length = 0; //@TODO Sometimes we have an alignment like a X b -> alpha beta X. In this case
+ //The length of the source phrase that the nonterminal covers doesn't influence the offset of b.
+ //However in cases such as a X b -> alpha X beta, it does. We have to determine how many nonterminals
+ //are before b and add their source span to the source_word_mid_idx.
+ source_word_mid_idx = source_phrase_start_pos + (int)source_center_index + nonterm_length;
+ }
+ word_alignments.push_back(source_word_mid_idx);
+ }
+ }
+
+}
+
+size_t BilingualLM::getStateChart(std::vector<int>& neuralLMids) const {
+ size_t hashCode = 0;
+ for (int i = neuralLMids.size() - target_ngrams; i < neuralLMids.size(); i++){
+ int neuralLM_wordID;
+ if (i < 0) {
+ neuralLM_wordID = getNeuralLMId(BOS_word, false);
+ } else {
+ neuralLM_wordID = neuralLMids[i];
+ }
+ boost::hash_combine(hashCode, neuralLM_wordID);
+ }
+ return hashCode;
+}
+
+void BilingualLM::getTargetWordsChart(
+ std::vector<int>& neuralLMids,
+ int current_word_index,
+ std::vector<int>& words,
+ bool sentence_begin) const {
+
+ for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
+ if (i < 0) {
+ if (sentence_begin) {
+ words.push_back(getNeuralLMId(BOS_word, false));
+ } else {
+ words.push_back(getNeuralLMId(getNullWord(), false));
+ }
+ } else {
+ words.push_back(neuralLMids[i]);
+ }
+ }
+}
+
+void BilingualLM::appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const {
+ //Define begin and end indexes of the lookup, with cases for even and odd ngrams.
+ //The indexes may fall outside the source sentence; in that case we pad
+ //with BOS/EOS ids below.
+ int begin_idx;
+ int end_idx;
+
+ if (source_ngrams % 2 == 0) {
+ begin_idx = source_word_mid_idx - source_ngrams / 2 + 1;
+ end_idx = source_word_mid_idx + source_ngrams / 2;
+ } else {
+ begin_idx = source_word_mid_idx - (source_ngrams - 1) / 2;
+ end_idx = source_word_mid_idx + (source_ngrams - 1) / 2;
+ }
+
+ //Add words to vector
+ for (int j = begin_idx; j <= end_idx; j++) {
+ int neuralLM_wordID;
+ if (j < 0) {
+ neuralLM_wordID = getNeuralLMId(BOS_word, true);
+ } else if (j >= source_sent.GetSize()) {
+ neuralLM_wordID = getNeuralLMId(EOS_word, true);
+ } else {
+ const Word& word = source_sent.GetWord(j);
+ neuralLM_wordID = getNeuralLMId(word, true);
+ }
+ words.push_back(neuralLM_wordID);
+ }
+}
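
// A small sketch (SourceWindow is an assumed helper, not in the patch) of
// the window arithmetic above: an odd n gives a window centred on mid, an
// even n is biased one word to the right. E.g. n=5, mid=7 -> [5,9]; n=4,
// mid=7 -> [6,9]. Indexes falling outside the sentence are padded with
// BOS/EOS ids by the loop above.
static void SourceWindow(int n, int mid, int &begin, int &end) {
  if (n % 2 == 0) {
    begin = mid - n / 2 + 1;    // even: one extra word on the right of mid
    end   = mid + n / 2;
  } else {
    begin = mid - (n - 1) / 2;  // odd: symmetric around mid
    end   = mid + (n - 1) / 2;
  }
}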
+
+FFState* BilingualLM::EvaluateWhenApplied(
+ const ChartHypothesis& cur_hypo,
+ int featureID, /* - used to index the state in the previous hypotheses */
+ ScoreComponentCollection* accumulator) const {
+ //Init vectors
+ std::vector<int> source_words;
+ source_words.reserve(source_ngrams);
+ std::vector<int> target_words;
+ target_words.reserve(target_ngrams);
+
+ float value = 0; //NeuralLM score
+ const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
+
+ std::vector<int> neuralLMids; //More or less equivalent to whole_phrase: contains all word ids, but cheaper to build.
+ std::vector<int> alignments;
+ //Estimate size and reserve vectors to avoid reallocation.
+ int future_size = currTargetPhrase.GetNumTerminals();
+ for (int i =0; i<currTargetPhrase.GetNumNonTerminals(); i++){
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(i); //Each nonterminal's previous state carries its word ids.
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int>& wordIds = prev_state->GetWordIdsVector();
+ future_size += wordIds.size();
+ }
+ neuralLMids.reserve(future_size);
+ alignments.reserve(future_size);
+
+ getAllTargetIdsChart(cur_hypo, featureID, neuralLMids);
+ getAllAlignments(cur_hypo, featureID, alignments);
+
+ bool sentence_begin = false; //Check if this hypothesis' target words are located at the beginning of the sentence.
+ if (neuralLMids[0] == getNeuralLMId(BOS_word, true)){
+ sentence_begin = true;
+ }
+
+ //Get source sentence
+ const ChartManager& manager = cur_hypo.GetManager();
+ const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());
+
+ for (int i = 0; i < neuralLMids.size(); i++) { //Covers every word id, including those inherited from expanded nonterminals.
+
+ //We already have resolved the nonterminals, we are left with a simple loop.
+ appendSourceWordsToVector(source_sent, source_words, alignments[i]);
+ getTargetWordsChart(neuralLMids, i, target_words, sentence_begin);
+
+ value += Score(source_words, target_words); // Get the score
+
+ //Clear the vectors before the next iteration
+ source_words.clear();
+ target_words.clear();
+
+ }
+ size_t new_state = getStateChart(neuralLMids);
+
+ accumulator->Assign(this, value);
+
+ return new BilingualLMState(new_state, alignments, neuralLMids);
+}
+
+void BilingualLM::SetParameter(const std::string& key, const std::string& value) {
+ if (key == "filepath") {
+ m_filePath = value;
+ } else {
+ StatefulFeatureFunction::SetParameter(key, value);
+ }
+}
+
+} // namespace Moses
+
diff --git a/moses/LM/BilingualLM.h b/moses/LM/BilingualLM.h
new file mode 100644
index 000000000..9f7235956
--- /dev/null
+++ b/moses/LM/BilingualLM.h
@@ -0,0 +1,142 @@
+#pragma once
+
+#include <string>
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/FF/FFState.h"
+#include <boost/thread/tss.hpp>
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/InputPath.h"
+#include "moses/Manager.h"
+#include "moses/ChartManager.h"
+#include "moses/FactorCollection.h"
+
+namespace Moses
+{
+
+class BilingualLMState : public FFState
+{
+ size_t m_hash;
+ std::vector<int> word_alignments; //Carry the word alignments. For hierarchical
+ std::vector<int> neuralLM_ids; //Carry the neuralLMids of the previous target phrase to avoid calling GetWholePhrase. Hiero only.
+public:
+ BilingualLMState(size_t hash)
+ :m_hash(hash)
+ {}
+ BilingualLMState(size_t hash, std::vector<int>& word_alignments_vec, std::vector<int>& neural_ids)
+ :m_hash(hash)
+ , word_alignments(word_alignments_vec)
+ , neuralLM_ids(neural_ids)
+ {}
+
+ const std::vector<int>& GetWordAlignmentVector() const {
+ return word_alignments;
+ }
+
+ const std::vector<int>& GetWordIdsVector() const {
+ return neuralLM_ids;
+ }
+
+ int Compare(const FFState& other) const;
+};
+
+class BilingualLM : public StatefulFeatureFunction {
+ private:
+ virtual float Score(std::vector<int>& source_words, std::vector<int>& target_words) const = 0;
+
+ virtual int getNeuralLMId(const Word& word, bool is_source_word) const = 0;
+
+ virtual void loadModel() = 0;
+
+ virtual const Word& getNullWord() const = 0;
+
+ size_t selectMiddleAlignment(const std::set<size_t>& alignment_links) const;
+
+ void getSourceWords(
+ const TargetPhrase &targetPhrase,
+ int targetWordIdx,
+ const Sentence &source_sent,
+ const WordsRange &sourceWordRange,
+ std::vector<int> &words) const;
+
+ void appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const;
+
+ void getTargetWords(
+ const Hypothesis &cur_hypo,
+ const TargetPhrase &targetPhrase,
+ int current_word_index,
+ std::vector<int> &words) const;
+
+ size_t getState(const Hypothesis &cur_hypo) const;
+
+ void requestPrevTargetNgrams(const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const;
+
+ //Chart decoder
+ void getTargetWordsChart(
+ std::vector<int>& neuralLMids,
+ int current_word_index,
+ std::vector<int>& words,
+ bool sentence_begin) const;
+
+ size_t getStateChart(std::vector<int>& neuralLMids) const;
+
+ //Get a vector of all target word IDs at the start of calculating NeuralLM ids for the current phrase.
+ void getAllTargetIdsChart(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& wordIds) const;
+ //Get a vector of all alignments (the mid-point source index for each target word).
+ void getAllAlignments(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& alignments) const;
+
+protected:
+ // big data (vocab, weights, cache) shared among threads
+ std::string m_filePath;
+ int target_ngrams;
+ int source_ngrams;
+
+ //NeuralLM lookup
+ FactorType word_factortype;
+ FactorType pos_factortype;
+ const Factor* BOS_factor;
+ const Factor* EOS_factor;
+ mutable Word BOS_word;
+ mutable Word EOS_word;
+
+public:
+ BilingualLM(const std::string &line);
+
+ bool IsUseable(const FactorMask &mask) const {
+ return true;
+ }
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const {
+ return new BilingualLMState(0);
+ }
+
+ void Load();
+
+ void EvaluateInIsolation(
+ const Phrase &source,
+ const TargetPhrase &targetPhrase,
+ ScoreComponentCollection &scoreBreakdown,
+ ScoreComponentCollection &estimatedFutureScore) const;
+
+ void EvaluateWithSourceContext(
+ const InputType &input,
+ const InputPath &inputPath,
+ const TargetPhrase &targetPhrase,
+ const StackVec *stackVec,
+ ScoreComponentCollection &scoreBreakdown,
+ ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ FFState* EvaluateWhenApplied(
+ const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
+
+ FFState* EvaluateWhenApplied(
+    const ChartHypothesis& cur_hypo,
+    int featureID, /* used to index the state in the previous hypotheses */
+ ScoreComponentCollection* accumulator) const;
+
+ void SetParameter(const std::string& key, const std::string& value);
+};
+
+}
+
diff --git a/moses/LM/Implementation.cpp b/moses/LM/Implementation.cpp
index bd5bd1834..9e6746454 100644
--- a/moses/LM/Implementation.cpp
+++ b/moses/LM/Implementation.cpp
@@ -338,7 +338,15 @@ FFState* LanguageModelImplementation::EvaluateWhenApplied(const ChartHypothesis&
}
// assign combined score to score breakdown
- out->Assign(this, prefixScore + finalizedScore);
+ if (OOVFeatureEnabled()) {
+ vector<float> scores(2);
+ scores[0] = prefixScore + finalizedScore;
+ scores[1] = out->GetScoresForProducer(this)[1];
+ out->Assign(this, scores);
+ }
+ else {
+ out->Assign(this, prefixScore + finalizedScore);
+ }
ret->Set(prefixScore, lmState);
return ret;
diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile
index dbc83d738..3d68d161b 100644
--- a/moses/LM/Jamfile
+++ b/moses/LM/Jamfile
@@ -87,23 +87,26 @@ if $(with-ldhtlm) {
#NPLM
local with-nplm = [ option.get "with-nplm" ] ;
if $(with-nplm) {
- lib neuralLM : : <search>$(with-nplm)/lib <search>$(with-nplm)/lib64 ;
- obj NeuralLMWrapper.o : NeuralLMWrapper.cpp neuralLM ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
- alias nplm : NeuralLMWrapper.o neuralLM : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
- dependencies += nplm ;
+ lib nplm : : <search>$(with-nplm)/lib <search>$(with-nplm)/lib64 ;
+ obj NeuralLMWrapper.o : NeuralLMWrapper.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
+ obj BiLM_NPLM.o : bilingual-lm/BiLM_NPLM.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
+ alias neural : NeuralLMWrapper.o BiLM_NPLM.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
+ dependencies += neural ;
lmmacros += LM_NEURAL ;
}
-#LBLLM
-local with-lbllm = [ option.get "with-lbllm" ] ;
-if $(with-lbllm) {
- lib lbl : : <search>$(with-lbllm)/lib <search>$(with-lbllm)/lib64 ;
- lib murmurhash : : <search>$(with-lbllm)/lib <search>$(with-lbllm)/lib64 ;
- obj LBLLM.o : oxlm/LBLLM.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
- obj Mapper.o : oxlm/Mapper.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
- alias lbllm : LBLLM.o Mapper.o lbl murmurhash /top//boost_filesystem : : : <cxxflags>-std=c++0x <define>LM_LBL ;
- dependencies += lbllm ;
- lmmacros += LM_LBL ;
+#OxLM
+local with-oxlm = [ option.get "with-oxlm" ] ;
+if $(with-oxlm) {
+ lib lbl : : <search>$(with-oxlm)/lib <search>$(with-oxlm)/lib64 ;
+ lib murmurhash : : <search>$(with-oxlm)/lib <search>$(with-oxlm)/lib64 ;
+ obj OxLM.o : oxlm/OxLM.cpp lbl ..//headers : <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+ obj SourceOxLM.o : oxlm/SourceOxLM.cpp lbl ..//headers : <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+ obj OxLMMapper.o : oxlm/OxLMMapper.cpp lbl ..//headers : <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+ obj OxLMParallelMapper.o : oxlm/OxLMParallelMapper.cpp lbl ..//headers : <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+ alias oxlm : OxLM.o SourceOxLM.o OxLMMapper.o OxLMParallelMapper.o lbl murmurhash /top//boost_filesystem : : : <cxxflags>-std=c++0x <define>LM_OXLM ;
+ dependencies += oxlm ;
+ lmmacros += LM_OXLM ;
}
@@ -130,7 +133,7 @@ obj ORLM.o : ORLM.cpp ..//headers ../TranslationModel/DynSAInclude//dynsa : : :
#Top-level LM library. If you've added a file that doesn't depend on external
#libraries, put it here.
-alias LM : Backward.cpp BackwardLMState.cpp Base.cpp Implementation.cpp Joint.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp SkeletonLM.cpp ORLM.o
+alias LM : Backward.cpp BackwardLMState.cpp Base.cpp BilingualLM.cpp Implementation.cpp Joint.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp SkeletonLM.cpp ORLM.o
../../lm//kenlm ..//headers $(dependencies) ;
alias macros : : : : <define>$(lmmacros) ;
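These Jamfile changes rename the options that switch the neural LMs on at build time. A plausible pair of invocations, assuming NPLM and OxLM checkouts installed under the given prefixes (both paths are placeholders):

    ./bjam --with-nplm=/opt/nplm   # enables NeuralLMWrapper and BiLM_NPLM (LM_NEURAL)
    ./bjam --with-oxlm=/opt/oxlm   # enables OxLM, SourceOxLM and the mappers (LM_OXLM)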
diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp
index e69746084..7346be3a3 100644
--- a/moses/LM/Ken.cpp
+++ b/moses/LM/Ken.cpp
@@ -43,6 +43,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/ChartHypothesis.h"
#include "moses/Incremental.h"
#include "moses/UserMessage.h"
+#include "moses/Syntax/SVertex.h"
using namespace std;
@@ -348,6 +349,59 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateWhenApplied(con
float score = ruleScore.Finish();
score = TransformLMScore(score);
+ if (OOVFeatureEnabled()) {
+ std::vector<float> scores(2);
+ scores[0] = score;
+ scores[1] = 0.0;
+ accumulator->Assign(this, scores);
+ }
+ else {
+ accumulator->Assign(this, score);
+ }
+ return newState;
+}
+
+template <class Model> FFState *LanguageModelKen<Model>::EvaluateWhenApplied(const Syntax::SHyperedge& hyperedge, int featureID, ScoreComponentCollection *accumulator) const
+{
+ LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
+ const TargetPhrase &target = *hyperedge.translation;
+ const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
+ target.GetAlignNonTerm().GetNonTermIndexMap2();
+
+ const size_t size = target.GetSize();
+ size_t phrasePos = 0;
+ // Special cases for first word.
+ if (size) {
+ const Word &word = target.GetWord(0);
+ if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
+ // Begin of sentence
+ ruleScore.BeginSentence();
+ phrasePos++;
+ } else if (word.IsNonTerminal()) {
+ // Non-terminal is first so we can copy instead of rescoring.
+ const Syntax::SVertex *pred = hyperedge.tail[nonTermIndexMap[phrasePos]];
+ const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(pred->state[featureID])->GetChartState();
+ float prob = UntransformLMScore(pred->best->scoreBreakdown.GetScoresForProducer(this)[0]);
+ ruleScore.BeginNonTerminal(prevState, prob);
+ phrasePos++;
+ }
+ }
+
+ for (; phrasePos < size; phrasePos++) {
+ const Word &word = target.GetWord(phrasePos);
+ if (word.IsNonTerminal()) {
+ const Syntax::SVertex *pred = hyperedge.tail[nonTermIndexMap[phrasePos]];
+ const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(pred->state[featureID])->GetChartState();
+ float prob = UntransformLMScore(pred->best->scoreBreakdown.GetScoresForProducer(this)[0]);
+ ruleScore.NonTerminal(prevState, prob);
+ } else {
+ ruleScore.Terminal(TranslateID(word));
+ }
+ }
+
+ float score = ruleScore.Finish();
+ score = TransformLMScore(score);
accumulator->Assign(this, score);
return newState;
}
diff --git a/moses/LM/Ken.h b/moses/LM/Ken.h
index 2f473b697..a2fdb6013 100644
--- a/moses/LM/Ken.h
+++ b/moses/LM/Ken.h
@@ -59,6 +59,8 @@ public:
virtual FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
+ virtual FFState *EvaluateWhenApplied(const Syntax::SHyperedge& hyperedge, int featureID, ScoreComponentCollection *accumulator) const;
+
virtual void IncrementalCallback(Incremental::Manager &manager) const;
virtual void ReportHistoryOrder(std::ostream &out,const Phrase &phrase) const;
diff --git a/moses/LM/NeuralLMWrapper.cpp b/moses/LM/NeuralLMWrapper.cpp
index ab7b5400b..95b2bf13b 100644
--- a/moses/LM/NeuralLMWrapper.cpp
+++ b/moses/LM/NeuralLMWrapper.cpp
@@ -32,10 +32,14 @@ void NeuralLMWrapper::Load()
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
m_sentenceEndWord[m_factorType] = m_sentenceEnd;
- m_neuralLM_shared = new nplm::neuralLM(m_filePath, true);
+ m_neuralLM_shared = new nplm::neuralLM();
+ m_neuralLM_shared->read(m_filePath);
+ m_neuralLM_shared->premultiply();
//TODO: config option?
m_neuralLM_shared->set_cache(1000000);
+ m_unk = m_neuralLM_shared->lookup_word("<unk>");
+
UTIL_THROW_IF2(m_nGramOrder != m_neuralLM_shared->get_order(),
"Wrong order of neuralLM: LM has " << m_neuralLM_shared->get_order() << ", but Moses expects " << m_nGramOrder);
@@ -47,6 +51,8 @@ LMResult NeuralLMWrapper::GetValue(const vector<const Word*> &contextFactor, Sta
if (!m_neuralLM.get()) {
m_neuralLM.reset(new nplm::neuralLM(*m_neuralLM_shared));
+ //TODO: config option?
+ m_neuralLM->set_cache(1000000);
}
size_t hashCode = 0;
@@ -65,7 +71,7 @@ LMResult NeuralLMWrapper::GetValue(const vector<const Word*> &contextFactor, Sta
// Create a new struct to hold the result
LMResult ret;
ret.score = FloorScore(value);
- ret.unknown = false;
+ ret.unknown = (words.back() == m_unk);
(*finalState) = (State*) hashCode;
diff --git a/moses/LM/NeuralLMWrapper.h b/moses/LM/NeuralLMWrapper.h
index 7207605e1..2b80fb303 100644
--- a/moses/LM/NeuralLMWrapper.h
+++ b/moses/LM/NeuralLMWrapper.h
@@ -18,6 +18,7 @@ protected:
nplm::neuralLM *m_neuralLM_shared;
// thread-specific nplm for thread-safety
mutable boost::thread_specific_ptr<nplm::neuralLM> m_neuralLM;
+ int m_unk;
public:
NeuralLMWrapper(const std::string &line);
diff --git a/moses/LM/bilingual-lm/BiLM_NPLM.cpp b/moses/LM/bilingual-lm/BiLM_NPLM.cpp
new file mode 100644
index 000000000..190aade1c
--- /dev/null
+++ b/moses/LM/bilingual-lm/BiLM_NPLM.cpp
@@ -0,0 +1,139 @@
+#include "BiLM_NPLM.h"
+#include "neuralLM.h"
+#include "vocabulary.h"
+
+namespace Moses {
+
+BilingualLM_NPLM::BilingualLM_NPLM(const std::string &line)
+ : BilingualLM(line),
+    NULL_overwrite(false),
+    premultiply(true),
+    factored(false),
+    neuralLM_cache(1000000) {
+
+ if (!NULL_overwrite) {
+ NULL_string = "<null>"; //Default null value for nplm
+ }
+ FactorCollection& factorFactory = FactorCollection::Instance(); // To add null word.
+ const Factor* NULL_factor = factorFactory.AddFactor(NULL_string);
+ NULL_word.SetFactor(0, NULL_factor);
+ }
+
+float BilingualLM_NPLM::Score(std::vector<int>& source_words, std::vector<int>& target_words) const {
+ source_words.reserve(source_ngrams+target_ngrams+1);
+ source_words.insert( source_words.end(), target_words.begin(), target_words.end() );
+ return FloorScore(m_neuralLM->lookup_ngram(source_words));
+}
+
+const Word& BilingualLM_NPLM::getNullWord() const {
+ return NULL_word;
+}
+
+int BilingualLM_NPLM::getNeuralLMId(const Word& word, bool is_source_word) const {
+ initSharedPointer();
+
+ //Decide if we are doing source or target side first.
+ boost::unordered_map<const Factor*, int> * neuralLMids;
+ int unknown_word_id;
+ if (is_source_word) {
+ neuralLMids = &source_neuralLMids;
+ unknown_word_id = source_unknown_word_id;
+ } else {
+ neuralLMids = &target_neuralLMids;
+ unknown_word_id = target_unknown_word_id;
+ }
+
+ boost::unordered_map<const Factor*, int>::iterator it;
+ const Factor* factor = word.GetFactor(word_factortype);
+
+ it = neuralLMids->find(factor);
+ //If we know the word return immediately
+ if (it != neuralLMids->end()){
+ return it->second;
+ }
+  //If we don't know the word and we aren't factored, return the unknown word id.
+ if (!factored) {
+ return unknown_word_id;
+ }
+ //Else try to get a pos_factor
+ const Factor* pos_factor = word.GetFactor(pos_factortype);
+ it = neuralLMids->find(pos_factor);
+ if (it != neuralLMids->end()){
+ return it->second;
+ } else {
+ return unknown_word_id;
+ }
+}
+
+void BilingualLM_NPLM::initSharedPointer() const {
+ if (!m_neuralLM.get()) {
+ m_neuralLM.reset(new nplm::neuralLM(*m_neuralLM_shared));
+ }
+}
+
+void BilingualLM_NPLM::SetParameter(const std::string& key, const std::string& value) {
+ if (key == "target_ngrams") {
+ target_ngrams = Scan<int>(value);
+ } else if (key == "source_ngrams") {
+ source_ngrams = Scan<int>(value);
+ } else if (key == "factored") {
+ factored = Scan<bool>(value);
+ } else if (key == "pos_factor") {
+ pos_factortype = Scan<FactorType>(value);
+ } else if (key == "source_vocab") {
+ source_vocab_path = value;
+ } else if (key == "target_vocab") {
+ target_vocab_path = value;
+ } else if (key == "cache_size") {
+ neuralLM_cache = atoi(value.c_str());
+ } else if (key == "premultiply") {
+ premultiply = Scan<bool>(value);
+ } else if (key == "null_word") {
+ NULL_string = value;
+ NULL_overwrite = true;
+ } else {
+ BilingualLM::SetParameter(key, value);
+ }
+}
+
+void BilingualLM_NPLM::loadModel() {
+ m_neuralLM_shared = new nplm::neuralLM();
+ m_neuralLM_shared->read(m_filePath);
+ if (premultiply) {
+ m_neuralLM_shared->premultiply();
+ }
+
+ int ngram_order = target_ngrams + source_ngrams + 1;
+ UTIL_THROW_IF2(
+ ngram_order != m_neuralLM_shared->get_order(),
+ "Wrong order of neuralLM: LM has " << m_neuralLM_shared->get_order() <<
+ ", but Moses expects " << ngram_order);
+
+ m_neuralLM_shared->set_cache(neuralLM_cache); //Default 1000000
+
+  //Set up the factor -> NeuralLM id mapping; target words first.
+ FactorCollection& factorFactory = FactorCollection::Instance(); //To do the conversion from string to vocabID
+ int wordid_counter = 0;
+ target_unknown_word_id = wordid_counter; //The first word is <unk>
+ std::string raw_word;
+ std::ifstream infile_target(target_vocab_path.c_str());
+ while (infile_target >> raw_word) {
+ const Factor * factor = factorFactory.AddFactor(raw_word);
+ target_neuralLMids.insert(std::make_pair(factor, wordid_counter));
+ wordid_counter++;
+ }
+ infile_target.close();
+  source_unknown_word_id = wordid_counter; //The first word of the next (source) file is <unk>
+
+ //Source words now:
+ std::ifstream infile_source(source_vocab_path.c_str());
+ while (infile_source >> raw_word) {
+ const Factor * factor = factorFactory.AddFactor(raw_word);
+ source_neuralLMids.insert(std::make_pair(factor, wordid_counter));
+ wordid_counter++;
+ }
+ infile_source.close();
+
+}
+
+} // namespace Moses
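loadModel() above assigns consecutive NeuralLM ids while reading the two vocabulary files token by token, target file first, and treats the first token of each file as the unknown word. A minimal sketch of a compatible vocabulary file (contents hypothetical; one token per whitespace-separated entry, with <unk> first so that its id becomes the unknown-word id):

    <unk>
    <s>
    </s>
    the
    house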
diff --git a/moses/LM/bilingual-lm/BiLM_NPLM.h b/moses/LM/bilingual-lm/BiLM_NPLM.h
new file mode 100644
index 000000000..9a3167455
--- /dev/null
+++ b/moses/LM/bilingual-lm/BiLM_NPLM.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include "moses/LM/BilingualLM.h"
+#include <boost/unordered_map.hpp>
+#include <utility> //make_pair
+#include <fstream> //Read vocabulary files
+
+namespace nplm {
+ class neuralLM;
+}
+
+namespace Moses {
+
+class BilingualLM_NPLM : public BilingualLM {
+ public:
+ BilingualLM_NPLM(const std::string &line);
+
+ private:
+ float Score(std::vector<int>& source_words, std::vector<int>& target_words) const;
+
+ int getNeuralLMId(const Word& word, bool is_source_word) const;
+
+ void initSharedPointer() const;
+
+ void loadModel();
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ const Word& getNullWord() const;
+
+ nplm::neuralLM *m_neuralLM_shared;
+ mutable boost::thread_specific_ptr<nplm::neuralLM> m_neuralLM;
+
+ mutable boost::unordered_map<const Factor*, int> target_neuralLMids;
+ mutable boost::unordered_map<const Factor*, int> source_neuralLMids;
+
+ //const Factor* NULL_factor_overwrite;
+ std::string NULL_string;
+ bool NULL_overwrite;
+ Word NULL_word;
+
+ std::string source_vocab_path;
+ std::string target_vocab_path;
+ bool premultiply;
+ bool factored;
+ int neuralLM_cache;
+ int source_unknown_word_id;
+ int target_unknown_word_id;
+};
+
+} // namespace Moses
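Combining these keys with the base-class "filepath" key handled in BilingualLM::SetParameter, a feature line would look roughly as follows; the feature name and all paths are hypothetical, only the parameter keys are taken from the code:

    [feature]
    BilingualNPLM filepath=/work/blm/bilingual.nnlm source_ngrams=9 target_ngrams=4 source_vocab=/work/blm/vocab.source target_vocab=/work/blm/vocab.target cache_size=1000000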
diff --git a/moses/LM/oxlm/Mapper.cpp b/moses/LM/oxlm/Mapper.cpp
deleted file mode 100644
index f1363ccf0..000000000
--- a/moses/LM/oxlm/Mapper.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-#include "Mapper.h"
-#include "moses/FactorCollection.h"
-
-using namespace std;
-
-namespace Moses
-{
-OXLMMapper::OXLMMapper(const oxlm::Dict& dict) : dict(dict)
-{
- for (int i = 0; i < dict.size(); ++i) {
- const string &str = dict.Convert(i);
- FactorCollection &fc = FactorCollection::Instance();
- const Moses::Factor *factor = fc.AddFactor(str, false);
- moses2lbl[factor] = i;
-
- //add(i, TD::Convert());
- }
-
- kUNKNOWN = this->dict.Convert("<unk>");
-}
-
-int OXLMMapper::convert(const Moses::Factor *factor) const
-{
- Coll::const_iterator iter;
- iter = moses2lbl.find(factor);
- if (iter == moses2lbl.end()) {
- return kUNKNOWN;
- }
- else {
- int ret = iter->second;
- return ret;
- }
-}
-
-std::vector<int> OXLMMapper::convert(const Phrase &phrase) const
-{
- size_t size = phrase.GetSize();
- vector<int> ret(size);
-
- for (size_t i = 0; i < size; ++i) {
- const Moses::Factor *factor = phrase.GetFactor(i, 0);
- int id = convert(factor);
- ret[i] = id;
- }
- return ret;
-}
-
-void OXLMMapper::convert(const std::vector<const Word*> &contextFactor, std::vector<int> &ids, int &word) const
-{
- size_t size = contextFactor.size();
-
- ids.resize(size - 1);
-
- for (size_t i = 0; i < size - 1; ++i) {
- const Moses::Factor *factor = contextFactor[i]->GetFactor(0);
- int id = convert(factor);
- ids[i] = id;
- }
- std::reverse(ids.begin(), ids.end());
-
- const Moses::Factor *factor = contextFactor.back()->GetFactor(0);
- word = convert(factor);
-
-}
-
-} // namespace
-
diff --git a/moses/LM/oxlm/Mapper.h b/moses/LM/oxlm/Mapper.h
deleted file mode 100644
index 79cbf7b5f..000000000
--- a/moses/LM/oxlm/Mapper.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#pragma once
-
-#include <map>
-#include "corpus/corpus.h"
-#include "moses/Factor.h"
-#include "moses/Phrase.h"
-
-namespace Moses
-{
-class OXLMMapper
-{
-public:
- OXLMMapper(const oxlm::Dict& dict);
-
- int convert(const Moses::Factor *factor) const;
- std::vector<int> convert(const Phrase &phrase) const;
- void convert(const std::vector<const Word*> &contextFactor, std::vector<int> &ids, int &word) const;
-
-private:
- void add(int lbl_id, int cdec_id);
-
- oxlm::Dict dict;
- typedef std::map<const Moses::Factor*, int> Coll;
- Coll moses2lbl;
- int kUNKNOWN;
-
-};
-
-/**
- * Wraps the feature values computed from the LBL language model.
- */
-struct LBLFeatures {
- LBLFeatures() : LMScore(0), OOVScore(0) {}
- LBLFeatures(double lm_score, double oov_score)
- : LMScore(lm_score), OOVScore(oov_score) {}
- LBLFeatures& operator+=(const LBLFeatures& other) {
- LMScore += other.LMScore;
- OOVScore += other.OOVScore;
- return *this;
- }
-
- double LMScore;
- double OOVScore;
-};
-
-}
diff --git a/moses/LM/oxlm/LBLLM.cpp b/moses/LM/oxlm/OxLM.cpp
index 1bd9d768e..d5c380b71 100644
--- a/moses/LM/oxlm/LBLLM.cpp
+++ b/moses/LM/oxlm/OxLM.cpp
@@ -1,4 +1,4 @@
-#include "LBLLM.h"
+#include "OxLM.h"
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
@@ -15,7 +15,10 @@ namespace Moses
{
template<class Model>
-LBLLM<Model>::LBLLM(const string &line) : LanguageModelSingleFactor(line) {
+OxLM<Model>::OxLM(const string &line)
+ : LanguageModelSingleFactor(line), normalized(true),
+ posBackOff(false), posFactorType(1),
+ persistentCache(false) {
ReadParameters();
FactorCollection &factorCollection = FactorCollection::Instance();
@@ -32,7 +35,7 @@ LBLLM<Model>::LBLLM(const string &line) : LanguageModelSingleFactor(line) {
template<class Model>
-LBLLM<Model>::~LBLLM() {
+OxLM<Model>::~OxLM() {
if (persistentCache) {
double cache_hit_ratio = 100.0 * cacheHits / totalHits;
cerr << "Cache hit ratio: " << cache_hit_ratio << endl;
@@ -41,33 +44,48 @@
template<class Model>
-void LBLLM<Model>::SetParameter(const string& key, const string& value) {
- if (key == "persistent-cache") {
+void OxLM<Model>::SetParameter(const string& key, const string& value) {
+ if (key == "normalized") {
+ normalized = Scan<bool>(value);
+ } else if (key == "persistent-cache") {
persistentCache = Scan<bool>(value);
+ } else if (key == "normalized") {
+ normalized = Scan<bool>(value);
+ } else if (key == "pos-back-off") {
+ posBackOff = Scan<bool>(value);
+ } else if (key == "pos-factor-type") {
+ posFactorType = Scan<FactorType>(value);
} else {
LanguageModelSingleFactor::SetParameter(key, value);
}
}
template<class Model>
-void LBLLM<Model>::Load() {
+void OxLM<Model>::Load() {
model.load(m_filePath);
- Dict dict = model.getDict();
- mapper = boost::make_shared<OXLMMapper>(dict);
+ boost::shared_ptr<Vocabulary> vocab = model.getVocab();
+ mapper = boost::make_shared<OxLMMapper>(vocab, posBackOff, posFactorType);
- kSTART = dict.Convert("<s>");
- kSTOP = dict.Convert("</s>");
- kUNKNOWN = dict.Convert("<unk>");
+ kSTART = vocab->convert("<s>");
+ kSTOP = vocab->convert("</s>");
+ kUNKNOWN = vocab->convert("<unk>");
size_t ngram_order = model.getConfig()->ngram_order;
UTIL_THROW_IF2(
m_nGramOrder != ngram_order,
- "Wrong order for LBLLM: LM has " << ngram_order << ", but Moses expects " << m_nGramOrder);
+ "Wrong order for OxLM: LM has " << ngram_order << ", but Moses expects " << m_nGramOrder);
}
template<class Model>
-LMResult LBLLM<Model>::GetValue(
+double OxLM<Model>::GetScore(int word, const vector<int>& context) const {
+ if (normalized) {
+ return model.getLogProb(word, context);
+ } else {
+ return model.getUnnormalizedScore(word, context);
+ }
+}
+
+template<class Model>
+LMResult OxLM<Model>::GetValue(
const vector<const Word*> &contextFactor, State* finalState) const {
if (!cache.get()) {
cache.reset(new QueryCache());
@@ -78,14 +98,12 @@ LMResult LBLLM<Model>::GetValue(
mapper->convert(contextFactor, context, word);
size_t context_width = m_nGramOrder - 1;
-
if (!context.empty() && context.back() == kSTART) {
context.resize(context_width, kSTART);
} else {
context.resize(context_width, kUNKNOWN);
}
-
double score;
if (persistentCache) {
++totalHits;
@@ -95,11 +113,11 @@ LMResult LBLLM<Model>::GetValue(
score = ret.first;
++cacheHits;
} else {
- score = model.predict(word, context);
+ score = GetScore(word, context);
cache->put(query, score);
}
} else {
- score = model.predict(word, context);
+ score = GetScore(word, context);
}
LMResult ret;
@@ -119,7 +137,7 @@ LMResult LBLLM<Model>::GetValue(
}
template<class Model>
-void LBLLM<Model>::InitializeForInput(const InputType& source) {
+void OxLM<Model>::InitializeForInput(const InputType& source) {
LanguageModelSingleFactor::InitializeForInput(source);
if (persistentCache) {
@@ -137,13 +155,14 @@ void LBLLM<Model>::InitializeForInput(const InputType& source) {
cerr << "Done loading " << cache->size()
<< " n-gram probabilities..." << endl;
} else {
- cerr << "Cache file not found" << endl;
+ cerr << "Cache file not found!" << endl;
}
}
}
template<class Model>
-void LBLLM<Model>::CleanUpAfterSentenceProcessing(const InputType& source) {
+void OxLM<Model>::CleanUpAfterSentenceProcessing(const InputType& source) {
+ // Thread safe: the model cache is thread specific.
model.clearCache();
if (persistentCache) {
@@ -162,9 +181,9 @@ void LBLLM<Model>::CleanUpAfterSentenceProcessing(const InputType& source) {
LanguageModelSingleFactor::CleanUpAfterSentenceProcessing(source);
}
-template class LBLLM<LM>;
-template class LBLLM<FactoredLM>;
-template class LBLLM<FactoredMaxentLM>;
+template class OxLM<LM>;
+template class OxLM<FactoredLM>;
+template class OxLM<FactoredMaxentLM>;
}
diff --git a/moses/LM/oxlm/LBLLM.h b/moses/LM/oxlm/OxLM.h
index 67759a8bd..3ddc3058a 100644
--- a/moses/LM/oxlm/LBLLM.h
+++ b/moses/LM/oxlm/OxLM.h
@@ -6,23 +6,19 @@
#include "moses/LM/SingleFactor.h"
// lbl stuff
-#include "corpus/corpus.h"
#include "lbl/model.h"
#include "lbl/query_cache.h"
-#include "Mapper.h"
-
-namespace Moses
-{
+#include "OxLMMapper.h"
+namespace Moses {
template<class Model>
-class LBLLM : public LanguageModelSingleFactor
-{
-public:
- LBLLM(const std::string &line);
+class OxLM : public LanguageModelSingleFactor {
+ public:
+ OxLM(const std::string &line);
- ~LBLLM();
+ ~OxLM();
void SetParameter(const std::string& key, const std::string& value);
@@ -36,18 +32,25 @@ public:
virtual void CleanUpAfterSentenceProcessing(const InputType& source);
-protected:
+ private:
+ double GetScore(int word, const vector<int>& context) const;
+
+ protected:
Model model;
- boost::shared_ptr<OXLMMapper> mapper;
+ boost::shared_ptr<OxLMMapper> mapper;
int kSTART;
int kSTOP;
int kUNKNOWN;
+ bool normalized;
+
+ bool posBackOff;
+ FactorType posFactorType;
+
bool persistentCache;
mutable boost::thread_specific_ptr<oxlm::QueryCache> cache;
mutable int cacheHits, totalHits;
};
-
-}
+} // namespace Moses
diff --git a/moses/LM/oxlm/OxLMMapper.cpp b/moses/LM/oxlm/OxLMMapper.cpp
new file mode 100644
index 000000000..f2953b4e9
--- /dev/null
+++ b/moses/LM/oxlm/OxLMMapper.cpp
@@ -0,0 +1,47 @@
+#include "moses/LM/oxlm/OxLMMapper.h"
+
+#include "moses/FactorCollection.h"
+
+using namespace std;
+
+namespace Moses {
+
+OxLMMapper::OxLMMapper(
+ const boost::shared_ptr<oxlm::Vocabulary>& vocab,
+ bool pos_back_off,
+ const FactorType& pos_factor_type)
+ : posBackOff(pos_back_off), posFactorType(pos_factor_type) {
+ for (int i = 0; i < vocab->size(); ++i) {
+ const string &str = vocab->convert(i);
+ FactorCollection &fc = FactorCollection::Instance();
+ const Moses::Factor *factor = fc.AddFactor(str, false);
+ moses2Oxlm[factor] = i;
+ }
+
+ kUNKNOWN = vocab->convert("<unk>");
+}
+
+int OxLMMapper::convert(const Word& word) const {
+ const Moses::Factor* word_factor = word.GetFactor(0);
+ Coll::const_iterator iter = moses2Oxlm.find(word_factor);
+ if (posBackOff && iter == moses2Oxlm.end()) {
+ const Moses::Factor* pos_factor = word.GetFactor(posFactorType);
+ iter = moses2Oxlm.find(pos_factor);
+ }
+
+ return iter == moses2Oxlm.end() ? kUNKNOWN : iter->second;
+}
+
+void OxLMMapper::convert(
+ const vector<const Word*>& contextFactor,
+ vector<int> &ids, int &word) const {
+ ids.clear();
+ for (size_t i = 0; i < contextFactor.size() - 1; ++i) {
+ ids.push_back(convert(*contextFactor[i]));
+ }
+ std::reverse(ids.begin(), ids.end());
+
+ word = convert(*contextFactor.back());
+}
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/OxLMMapper.h b/moses/LM/oxlm/OxLMMapper.h
new file mode 100644
index 000000000..1aef7af88
--- /dev/null
+++ b/moses/LM/oxlm/OxLMMapper.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <map>
+
+#include "lbl/vocabulary.h"
+
+#include "moses/Factor.h"
+#include "moses/Phrase.h"
+
+namespace Moses {
+
+class OxLMMapper {
+ public:
+ OxLMMapper(
+ const boost::shared_ptr<oxlm::Vocabulary>& vocab,
+ bool pos_back_off,
+ const FactorType& pos_factor_type);
+
+ int convert(const Word& word) const;
+
+ void convert(
+ const std::vector<const Word*> &contextFactor,
+ std::vector<int> &ids,
+ int &word) const;
+
+ protected:
+ bool posBackOff;
+ FactorType posFactorType;
+
+ typedef std::map<const Moses::Factor*, int> Coll;
+ Coll moses2Oxlm;
+ int kUNKNOWN;
+};
+
+} // namespace Moses
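The two convert() overloads above encode the ordering contract OxLM relies on: the n-gram history is reversed so that the most recent word comes first. A small sketch of the contract, with assumed variable names:

    // contextFactor holds pointers to [w1, w2, w3, w4, w5]; w5 is being scored.
    std::vector<int> ids;
    int word;
    mapper->convert(contextFactor, ids, word);
    // Afterwards: word == id(w5) and ids == {id(w4), id(w3), id(w2), id(w1)},
    // i.e. the history in most-recent-first order.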
diff --git a/moses/LM/oxlm/OxLMParallelMapper.cpp b/moses/LM/oxlm/OxLMParallelMapper.cpp
new file mode 100644
index 000000000..3bfd4be04
--- /dev/null
+++ b/moses/LM/oxlm/OxLMParallelMapper.cpp
@@ -0,0 +1,40 @@
+#include "moses/LM/oxlm/OxLMParallelMapper.h"
+
+#include "lbl/parallel_vocabulary.h"
+
+#include "moses/FactorCollection.h"
+
+using namespace std;
+
+namespace Moses {
+
+OxLMParallelMapper::OxLMParallelMapper(
+ const boost::shared_ptr<oxlm::Vocabulary>& vocab,
+ bool pos_back_off,
+ const FactorType& pos_factor_type)
+ : OxLMMapper(vocab, pos_back_off, pos_factor_type) {
+ boost::shared_ptr<oxlm::ParallelVocabulary> parallel_vocab =
+ dynamic_pointer_cast<oxlm::ParallelVocabulary>(vocab);
+ assert(parallel_vocab != nullptr);
+
+ for (int i = 0; i < parallel_vocab->sourceSize(); ++i) {
+ string word = parallel_vocab->convertSource(i);
+ FactorCollection& fc = FactorCollection::Instance();
+ const Moses::Factor* factor = fc.AddFactor(word, false);
+ moses2SourceOxlm[factor] = i;
+ }
+
+ kSOURCE_UNKNOWN = parallel_vocab->convertSource("<unk>");
+}
+
+int OxLMParallelMapper::convertSource(const Word& word) const {
+ const Moses::Factor* word_factor = word.GetFactor(0);
+ Coll::const_iterator iter = moses2SourceOxlm.find(word_factor);
+ if (posBackOff && iter == moses2SourceOxlm.end()) {
+ const Moses::Factor* pos_factor = word.GetFactor(posFactorType);
+ iter = moses2SourceOxlm.find(pos_factor);
+ }
+ return iter == moses2SourceOxlm.end() ? kSOURCE_UNKNOWN : iter->second;
+}
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/OxLMParallelMapper.h b/moses/LM/oxlm/OxLMParallelMapper.h
new file mode 100644
index 000000000..9fbcfa2a3
--- /dev/null
+++ b/moses/LM/oxlm/OxLMParallelMapper.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "moses/LM/oxlm/OxLMMapper.h"
+
+namespace Moses {
+
+class OxLMParallelMapper : public OxLMMapper {
+ public:
+ OxLMParallelMapper(
+ const boost::shared_ptr<oxlm::Vocabulary>& vocab,
+ bool pos_back_off,
+ const FactorType& pos_factor_type);
+
+ int convertSource(const Word& word) const;
+
+ private:
+ Coll moses2SourceOxlm;
+ int kSOURCE_UNKNOWN;
+};
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/SourceOxLM.cpp b/moses/LM/oxlm/SourceOxLM.cpp
new file mode 100644
index 000000000..4a6991eb2
--- /dev/null
+++ b/moses/LM/oxlm/SourceOxLM.cpp
@@ -0,0 +1,137 @@
+#include "moses/LM/oxlm/SourceOxLM.h"
+
+#include <boost/archive/binary_iarchive.hpp>
+#include <boost/archive/binary_oarchive.hpp>
+#include <boost/filesystem.hpp>
+
+using namespace std;
+using namespace oxlm;
+
+namespace Moses {
+
+SourceOxLM::SourceOxLM(const string &line)
+ : BilingualLM(line), posBackOff(false), posFactorType(1),
+ persistentCache(false), cacheHits(0), totalHits(0) {
+ FactorCollection& factorFactory = FactorCollection::Instance(); // To add null word.
+ const Factor* NULL_factor = factorFactory.AddFactor("<unk>");
+ NULL_word.SetFactor(0, NULL_factor);
+ }
+
+SourceOxLM::~SourceOxLM() {
+ if (persistentCache) {
+ double cache_hit_ratio = 100.0 * cacheHits / totalHits;
+ cerr << "Cache hit ratio: " << cache_hit_ratio << endl;
+ }
+}
+
+float SourceOxLM::Score(
+ vector<int>& source_words,
+ vector<int>& target_words) const {
+ // OxLM expects the context in the following format:
+ // [t_{n-1}, t_{n-2}, ..., t_{n-m}, s_{a_n-sm}, s_{a_n-sm+1}, ..., s_{a_n+sm}]
+ // where n is the index for the current target word, m is the target order,
+ // a_n is t_n's affiliation and sm is the source order.
+ vector<int> context = target_words;
+ int word = context.back();
+ context.pop_back();
+ reverse(context.begin(), context.end());
+ context.insert(context.end(), source_words.begin(), source_words.end());
+
+ float score;
+ if (persistentCache) {
+ if (!cache.get()) {
+ cache.reset(new QueryCache());
+ }
+
+ ++totalHits;
+ NGram query(word, context);
+ pair<double, bool> ret = cache->get(query);
+ if (ret.second) {
+ score = ret.first;
+ ++cacheHits;
+ } else {
+ score = model.getLogProb(word, context);
+ cache->put(query, score);
+ }
+ } else {
+ score = model.getLogProb(word, context);
+ }
+
+ // TODO(pauldb): Return OOV count too.
+ return score;
+}
+
+int SourceOxLM::getNeuralLMId(const Word& word, bool is_source_word) const {
+ return is_source_word ? mapper->convertSource(word) : mapper->convert(word);
+}
+
+const Word& SourceOxLM::getNullWord() const {
+ return NULL_word;
+}
+
+void SourceOxLM::loadModel() {
+ model.load(m_filePath);
+
+ boost::shared_ptr<ModelData> config = model.getConfig();
+ source_ngrams = 2 * config->source_order - 1;
+ target_ngrams = config->ngram_order - 1;
+
+ boost::shared_ptr<Vocabulary> vocab = model.getVocab();
+ mapper = boost::make_shared<OxLMParallelMapper>(
+ vocab, posBackOff, posFactorType);
+}
+
+void SourceOxLM::SetParameter(const string& key, const string& value) {
+ if (key == "persistent-cache") {
+ persistentCache = Scan<bool>(value);
+ } else if (key == "pos-back-off") {
+ posBackOff = Scan<bool>(value);
+ } else if (key == "pos-factor-type") {
+ posFactorType = Scan<FactorType>(value);
+ } else {
+ BilingualLM::SetParameter(key, value);
+ }
+}
+
+void SourceOxLM::InitializeForInput(const InputType& source) {
+ BilingualLM::InitializeForInput(source);
+
+ if (persistentCache) {
+ if (!cache.get()) {
+ cache.reset(new QueryCache());
+ }
+
+ int sentence_id = source.GetTranslationId();
+ string cacheFile = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
+ if (boost::filesystem::exists(cacheFile)) {
+ ifstream fin(cacheFile);
+ boost::archive::binary_iarchive iar(fin);
+ cerr << "Loading n-gram probability cache from " << cacheFile << endl;
+ iar >> *cache;
+ cerr << "Done loading " << cache->size()
+ << " n-gram probabilities..." << endl;
+ } else {
+ cerr << "Cache file not found!" << endl;
+ }
+ }
+}
+
+void SourceOxLM::CleanUpAfterSentenceProcessing(const InputType& source) {
+ // Thread safe: the model cache is thread specific.
+ model.clearCache();
+
+ if (persistentCache) {
+ int sentence_id = source.GetTranslationId();
+ string cacheFile = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
+ ofstream fout(cacheFile);
+ boost::archive::binary_oarchive oar(fout);
+ cerr << "Saving persistent cache to " << cacheFile << endl;
+ oar << *cache;
+ cerr << "Done saving " << cache->size()
+ << " n-gram probabilities..." << endl;
+
+ cache->clear();
+ }
+}
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/SourceOxLM.h b/moses/LM/oxlm/SourceOxLM.h
new file mode 100644
index 000000000..3af48489f
--- /dev/null
+++ b/moses/LM/oxlm/SourceOxLM.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <vector>
+
+#include "lbl/model.h"
+#include "lbl/query_cache.h"
+
+#include "moses/LM/BilingualLM.h"
+#include "moses/LM/oxlm/OxLMParallelMapper.h"
+
+namespace Moses {
+
+class SourceOxLM : public BilingualLM {
+ public:
+ SourceOxLM(const std::string &line);
+
+ ~SourceOxLM();
+
+ private:
+ virtual float Score(
+ std::vector<int>& source_words,
+ std::vector<int>& target_words) const;
+
+ virtual int getNeuralLMId(const Word& word, bool is_source_word) const;
+
+ virtual void loadModel();
+
+ const Word& getNullWord() const;
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void InitializeForInput(const InputType& source);
+
+ void CleanUpAfterSentenceProcessing(const InputType& source);
+
+ protected:
+ oxlm::SourceFactoredLM model;
+ boost::shared_ptr<OxLMParallelMapper> mapper;
+
+ bool posBackOff;
+ FactorType posFactorType;
+
+ bool persistentCache;
+ mutable boost::thread_specific_ptr<oxlm::QueryCache> cache;
+ mutable int cacheHits, totalHits;
+ Word NULL_word; //Null symbol for hiero
+};
+
+} // namespace Moses
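A worked instance of the layout documented in SourceOxLM::Score (sizes assumed): with target_ngrams = 2, target_words arrives as [t_{n-2}, t_{n-1}, t_n], and with source_ngrams = 3, source_words is [s_{a_n-1}, s_{a_n}, s_{a_n+1}]. The reshuffle in Score() then produces

    word    = t_n
    context = [t_{n-1}, t_{n-2}, s_{a_n-1}, s_{a_n}, s_{a_n+1}]

that is, the target history reversed (most recent first) followed by the source window, which is the order getLogProb expects.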
diff --git a/moses/LatticeMBR.cpp b/moses/LatticeMBR.cpp
index 148b44743..9ea21d5db 100644
--- a/moses/LatticeMBR.cpp
+++ b/moses/LatticeMBR.cpp
@@ -13,9 +13,8 @@
#include <set>
using namespace std;
-using namespace Moses;
-namespace MosesCmd
+namespace Moses
{
size_t bleu_order = 4;
diff --git a/moses/LatticeMBR.h b/moses/LatticeMBR.h
index ab8b3cb76..47d6da3c4 100644
--- a/moses/LatticeMBR.h
+++ b/moses/LatticeMBR.h
@@ -19,7 +19,7 @@
-namespace MosesCmd
+namespace Moses
{
class Edge;
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 5ebd0b9c4..7a27dcaaf 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -54,12 +54,11 @@ using namespace std;
namespace Moses
{
-Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm)
+Manager::Manager(InputType const& source, SearchAlgorithm searchAlgorithm)
:m_transOptColl(source.CreateTranslationOptionCollection())
,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
,interrupted_flag(0)
,m_hypoId(0)
- ,m_lineNumber(lineNumber)
,m_source(source)
{
StaticData::Instance().InitializeForInput(m_source);
@@ -105,7 +104,7 @@ void Manager::ProcessSentence()
// some reporting on how long this took
IFVERBOSE(1) {
GetSentenceStats().StopTimeCollectOpts();
- TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took "
+ TRACE_ERR("Line "<< m_source.GetTranslationId() << ": Collecting options took "
<< GetSentenceStats().GetTimeCollectOpts() << " seconds at "
<< __FILE__ << ":" << __LINE__ << endl);
}
@@ -114,7 +113,7 @@ void Manager::ProcessSentence()
Timer searchTime;
searchTime.start();
m_search->ProcessSentence();
- VERBOSE(1, "Line " << m_lineNumber << ": Search took " << searchTime << " seconds" << endl);
+ VERBOSE(1, "Line " << m_source.GetTranslationId() << ": Search took " << searchTime << " seconds" << endl);
IFVERBOSE(2) {
GetSentenceStats().StopTimeTotal();
TRACE_ERR(GetSentenceStats());
@@ -831,7 +830,7 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
void Manager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const
{
- VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << m_lineNumber << std::endl)
+ VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
@@ -842,7 +841,7 @@ void Manager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStrea
set<int> terminalNodes;
multimap<int,int> hypergraphIDToArcs;
- VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << m_lineNumber << std::endl)
+ VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
long numNodes = 0;
long endNode = 0;
@@ -904,15 +903,15 @@ void Manager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStrea
// Print number of nodes and arcs
outputSearchGraphStream << numNodes << " " << numArcs << endl;
- VERBOSE(2,"Search graph to output as hypergraph for sentence " << m_lineNumber
+ VERBOSE(2,"Search graph to output as hypergraph for sentence " << m_source.GetTranslationId()
<< " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
- VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << m_lineNumber << std::endl)
+ VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
if (hypergraphHypothesisID % 100000 == 0) {
- VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << m_lineNumber << std::endl);
+ VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << m_source.GetTranslationId() << std::endl);
}
// int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
@@ -935,7 +934,7 @@ void Manager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStrea
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
UTIL_THROW_IF2(
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
- "Error while writing search lattice as hypergraph for sentence " << m_lineNumber << ". " <<
+ "Error while writing search lattice as hypergraph for sentence " << m_source.GetTranslationId() << ". " <<
"Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
". There are " << numNodes << " nodes in the search lattice."
@@ -950,7 +949,7 @@ void Manager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStrea
// VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
UTIL_THROW_IF2(
(startNode >= hypergraphHypothesisID),
- "Error while writing search lattice as hypergraph for sentence" << m_lineNumber << ". " <<
+ "Error while writing search lattice as hypergraph for sentence" << m_source.GetTranslationId() << ". " <<
"The nodes must be output in topological order. The code attempted to violate this restriction."
);
diff --git a/moses/Manager.h b/moses/Manager.h
index 9512bb472..ef4612de1 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -118,7 +118,6 @@ protected:
size_t interrupted_flag;
std::auto_ptr<SentenceStats> m_sentenceStats;
int m_hypoId; //used to number the hypos as they are created.
- size_t m_lineNumber;
void GetConnectedGraph(
std::map< int, bool >* pConnected,
@@ -130,7 +129,7 @@ protected:
public:
InputType const& m_source; /**< source sentence to be translated */
- Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm);
+ Manager(InputType const& source, SearchAlgorithm searchAlgorithm);
~Manager();
const TranslationOptionCollection* getSntTranslationOptions();
@@ -145,7 +144,7 @@ public:
void GetOutputLanguageModelOrder( std::ostream &out, const Hypothesis *hypo );
void GetWordGraph(long translationId, std::ostream &outputWordGraphStream) const;
int GetNextHypoId();
- size_t GetLineNumber() const {return m_lineNumber;}
+
#ifdef HAVE_PROTOBUF
void SerializeSearchGraphPB(long translationId, std::ostream& outputStream) const;
#endif
diff --git a/moses/MockHypothesis.cpp b/moses/MockHypothesis.cpp
index 3f68bd9a8..c18b58a5e 100644
--- a/moses/MockHypothesis.cpp
+++ b/moses/MockHypothesis.cpp
@@ -41,7 +41,7 @@ MockHypothesisGuard::MockHypothesisGuard(
m_wp("WordPenalty"),
m_uwp("UnknownWordPenalty"),
m_dist("Distortion"),
- m_manager(0,m_sentence,Normal)
+ m_manager(m_sentence,Normal)
{
BOOST_CHECK_EQUAL(alignments.size(), targetSegments.size());
diff --git a/moses/PDTAimp.cpp b/moses/PDTAimp.cpp
index f3f870e1d..34f65da4c 100644
--- a/moses/PDTAimp.cpp
+++ b/moses/PDTAimp.cpp
@@ -10,7 +10,6 @@ PDTAimp::PDTAimp(PhraseDictionaryTreeAdaptor *p)
totalE(0),
distinctE(0) {
m_numInputScores = 0;
- const StaticData &staticData = StaticData::Instance();
m_inputFeature = &InputFeature::Instance();
if (m_inputFeature) {
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 7780b543b..85d28bdb0 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -205,13 +205,23 @@ Parameter::Parameter()
AddParam("placeholder-factor", "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model");
AddParam("no-cache", "Disable all phrase-table caching. Default = false (ie. enable caching)");
AddParam("default-non-term-for-empty-range-only", "Don't add [X] to all ranges, just ranges where there isn't a source non-term. Default = false (ie. add [X] everywhere)");
+ AddParam("s2t", "Use specialized string-to-tree decoder.");
+ AddParam("s2t-parsing-algorithm", "Which S2T parsing algorithm to use. 0=recursive CYK+, 1=scope-3 (default = 0)");
+ AddParam("spe-src", "Simulated post-editing. Source filename");
+ AddParam("spe-trg", "Simulated post-editing. Target filename");
+ AddParam("spe-aln", "Simulated post-editing. Alignment filename");
}
Parameter::~Parameter()
{
}
+const PARAM_VEC &Parameter::GetParam(const std::string &paramName)
+{
+ return m_setting[paramName];
+}
+
/** initialize a parameter, sub of constructor */
void Parameter::AddParam(const string &paramName, const string &description)
{
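The five switches registered above are read like any other Parameter entry. A hypothetical invocation exercising them (all file names are placeholders):

    moses -f moses.ini -s2t -s2t-parsing-algorithm 1 \
          -spe-src source.txt -spe-trg postedited.txt -spe-aln alignment.txt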
diff --git a/moses/Parameter.h b/moses/Parameter.h
index 0eb06cc10..7e5e75496 100644
--- a/moses/Parameter.h
+++ b/moses/Parameter.h
@@ -90,11 +90,10 @@ public:
void Explain();
/** return a vector of strings holding the whitespace-delimited values on the ini-file line corresponding to the given parameter name */
- const PARAM_VEC &GetParam(const std::string &paramName) {
- return m_setting[paramName];
- }
+ const PARAM_VEC &GetParam(const std::string &paramName);
+
/** check if parameter is defined (either in moses.ini or as switch) */
- bool isParamSpecified(const std::string &paramName) {
+ bool isParamSpecified(const std::string &paramName) const {
return m_setting.find( paramName ) != m_setting.end();
}
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index c6be2e36f..bf1db45cf 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -67,6 +67,7 @@ StaticData::StaticData()
,m_isAlwaysCreateDirectTranslationOption(false)
,m_currentWeightSetting("default")
,m_treeStructure(NULL)
+ ,m_useS2TDecoder(false)
{
m_xmlBrackets.first="<";
m_xmlBrackets.second=">";
@@ -433,6 +434,10 @@ bool StaticData::LoadData(Parameter *parameter)
SetBooleanParameter( &m_defaultNonTermOnlyForEmptyRange, "default-non-term-for-empty-range-only", false );
SetBooleanParameter( &m_printNBestTrees, "n-best-trees", false );
+ // S2T decoder
+ SetBooleanParameter( &m_useS2TDecoder, "s2t", false );
+ m_s2tParsingAlgorithm = (m_parameter->GetParam("s2t-parsing-algorithm").size() > 0) ?
+ (S2TParsingAlgorithm) Scan<size_t>(m_parameter->GetParam("s2t-parsing-algorithm")[0]) : RecursiveCYKPlus;
// Compact phrase table and reordering model
SetBooleanParameter( &m_minphrMemory, "minphr-memory", false );
@@ -1155,6 +1160,15 @@ std::map<std::string, std::string> StaticData::OverrideFeatureNames()
}
}
+ if (m_useS2TDecoder) {
+ // Automatically override PhraseDictionary{Memory,Scope3}. This will
+ // have to change if the FF parameters diverge too much in the future,
+ // but for now it makes switching between the old and new decoders much
+ // more convenient.
+ ret["PhraseDictionaryMemory"] = "RuleTable";
+ ret["PhraseDictionaryScope3"] = "RuleTable";
+ }
+
return ret;
}
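Because of this override, an existing configuration keeps working when the new decoder is switched on: a phrase-table line such as the following (hypothetical excerpt) is read as a RuleTable whenever -s2t is given.

    PhraseDictionaryMemory name=TranslationModel0 num-features=4 path=/work/model/rule-table input-factor=0 output-factor=0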
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 2cd8e82c5..feb6c8c85 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -199,6 +199,8 @@ protected:
FactorType m_placeHolderFactor;
bool m_useLegacyPT;
bool m_defaultNonTermOnlyForEmptyRange;
+ bool m_useS2TDecoder;
+ S2TParsingAlgorithm m_s2tParsingAlgorithm;
bool m_printNBestTrees;
FeatureRegistry m_registry;
@@ -266,6 +268,10 @@ public:
return m_parameter->GetParam(paramName);
}
+ const Parameter &GetParameter() const {
+ return *m_parameter;
+ }
+
const std::vector<FactorType> &GetInputFactorOrder() const {
return m_inputFactorOrder;
}
@@ -767,6 +773,13 @@ public:
bool GetDefaultNonTermOnlyForEmptyRange() const
{ return m_defaultNonTermOnlyForEmptyRange; }
+ bool UseS2TDecoder() const {
+ return m_useS2TDecoder;
+ }
+ S2TParsingAlgorithm GetS2TParsingAlgorithm() const {
+ return m_s2tParsingAlgorithm;
+ }
+
bool PrintNBestTrees() const {
return m_printNBestTrees;
}
diff --git a/moses/Syntax/BoundedPriorityContainer.h b/moses/Syntax/BoundedPriorityContainer.h
new file mode 100644
index 000000000..9afc1b75d
--- /dev/null
+++ b/moses/Syntax/BoundedPriorityContainer.h
@@ -0,0 +1,164 @@
+#pragma once
+
+#include <queue>
+#include <vector>
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// A container that can hold up to k objects of type T, each with an associated
+// priority. The container accepts new elements unconditionally until the
+// limit is reached. After that, elements are only accepted if they have a
+// higher priority than the worst element (which they displace).
+//
+// BoundedPriorityContainer does not preserve the insertion order of the
+// elements (or provide any other guarantees about order).
+//
+// BoundedPriorityContainer pre-allocates space for all k objects.
+//
+// (Although BoundedPriorityContainer is implemented using a priority queue,
+// it doesn't provide the interface of a priority queue, hence the generic
+// name 'container'.)
+template<typename T>
+class BoundedPriorityContainer
+{
+ public:
+ typedef typename std::vector<T>::iterator Iterator;
+ typedef typename std::vector<T>::const_iterator ConstIterator;
+
+ BoundedPriorityContainer(std::size_t);
+
+ Iterator Begin() { return m_elements.begin(); }
+ Iterator End() { return m_elements.begin()+m_size; }
+
+ ConstIterator Begin() const { return m_elements.begin(); }
+ ConstIterator End() const { return m_elements.begin()+m_size; }
+
+ // Return the number of elements currently held.
+ std::size_t Size() const { return m_size; }
+
+ // 'Lazily' clear the container by setting the size to 0 (allowing elements
+ // to be overwritten).
+ // TODO Eliminate heap-reorganisation overhead by using a vector-based heap
+ // TODO directly instead of priority_queue, which requires pop() to clear
+  // TODO Alternatively, clear m_queue by assigning it an empty queue value,
+  // TODO but that might incur an alloc-related overhead when the new underlying
+  // TODO vector has to be regrown.
+ void LazyClear() { m_size = 0; while (!m_queue.empty()) { m_queue.pop(); } }
+
+ // Insert the given object iff
+ // i) the container is not full yet, or
+ // ii) the new object has a higher priority than the worst one already
+ // stored.
+ // The return value specifies whether or not the element was inserted.
+ bool Insert(const T &, float);
+
+ // Insert the given object iff
+ // i) the container is not full yet, or
+ // ii) the new object has a higher priority than the worst one already
+ // stored.
+ // If the element is inserted then, for efficiency reasons, it is swapped in
+ // rather than copied. This requires that T provides a swap() function. The
+ // return value specifies whether or not the element was inserted.
+ // TODO Test if this is actually any faster than Insert() in practice.
+ bool SwapIn(T &, float);
+
+ // Determine if an object with the given priority would be accepted for
+ // insertion based on the current contents of the container.
+ bool WouldAccept(float priority)
+ {
+ return m_size < m_limit || priority > m_queue.top().first;
+ }
+
+ private:
+ typedef std::pair<float, int> PriorityIndexPair;
+
+ class PriorityIndexPairOrderer
+ {
+ public:
+ bool operator()(const PriorityIndexPair &p,
+ const PriorityIndexPair &q) const {
+ return p.first > q.first;
+ }
+ };
+
+ // Min-priority queue. The queue stores the indices of the elements, not
+ // the elements themselves to keep down the costs of heap maintenance.
+ typedef std::priority_queue<PriorityIndexPair,
+ std::vector<PriorityIndexPair>,
+ PriorityIndexPairOrderer> Queue;
+
+ // The elements are stored in a vector. Note that the size of this vector
+ // can be greater than m_size (after a call to LazyClear).
+ std::vector<T> m_elements;
+
+ // The number of elements currently held.
+ std::size_t m_size;
+
+ // The maximum number of elements.
+ const std::size_t m_limit;
+
+ // The min-priority queue.
+ Queue m_queue;
+};
+
+template<typename T>
+BoundedPriorityContainer<T>::BoundedPriorityContainer(std::size_t limit)
+ : m_size(0)
+ , m_limit(limit)
+{
+ m_elements.reserve(m_limit);
+}
+
+template<typename T>
+bool BoundedPriorityContainer<T>::Insert(const T &t, float priority)
+{
+ if (m_size < m_limit) {
+ PriorityIndexPair pair(priority, m_size);
+ m_queue.push(pair);
+ if (m_size < m_elements.size()) {
+ m_elements[m_size] = t;
+ } else {
+ m_elements.push_back(t);
+ }
+ ++m_size;
+ return true;
+ } else if (priority > m_queue.top().first) {
+ PriorityIndexPair pair = m_queue.top();
+ m_queue.pop();
+ pair.first = priority;
+ m_elements[pair.second] = t;
+ m_queue.push(pair);
+ return true;
+ }
+ return false;
+}
+
+template<typename T>
+bool BoundedPriorityContainer<T>::SwapIn(T &t, float priority)
+{
+ if (m_size < m_limit) {
+ PriorityIndexPair pair(priority, m_size);
+ m_queue.push(pair);
+ if (m_size < m_elements.size()) {
+ swap(m_elements[m_size], t);
+ } else {
+ m_elements.push_back(t);
+ }
+ ++m_size;
+ return true;
+ } else if (priority > m_queue.top().first) {
+ PriorityIndexPair pair = m_queue.top();
+ m_queue.pop();
+ pair.first = priority;
+ swap(m_elements[pair.second], t);
+ m_queue.push(pair);
+ return true;
+ }
+ return false;
+}
+
+} // Syntax
+} // Moses
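A short usage sketch of the container (values made up): with a capacity of three, the fourth insertion displaces the current worst element, and lower-priority candidates are rejected outright.

    #include <string>
    #include "moses/Syntax/BoundedPriorityContainer.h"

    using Moses::Syntax::BoundedPriorityContainer;

    void Demo() {
      BoundedPriorityContainer<std::string> best(3);
      best.Insert("a", 0.10f);  // accepted: container not yet full
      best.Insert("b", 0.70f);  // accepted
      best.Insert("c", 0.40f);  // accepted: container is now full
      best.Insert("d", 0.90f);  // accepted: displaces "a", the current worst
      best.Insert("e", 0.05f);  // rejected: WouldAccept(0.05f) is false
      // best.Size() == 3; Begin()..End() yields "d", "b", "c" in unspecified order.
    }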
diff --git a/moses/Syntax/Cube.cpp b/moses/Syntax/Cube.cpp
new file mode 100644
index 000000000..4fcf50829
--- /dev/null
+++ b/moses/Syntax/Cube.cpp
@@ -0,0 +1,138 @@
+#include "Cube.h"
+
+#include "moses/FF/FFState.h"
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/StaticData.h"
+
+#include "SVertex.h"
+
+namespace Moses
+{
+
+namespace Syntax
+{
+
+Cube::Cube(const SHyperedgeBundle &bundle)
+ : m_bundle(bundle)
+{
+ // Create the SHyperedge for the 'corner' of the cube.
+ std::vector<int> coordinates(bundle.stacks.size()+1, 0);
+ SHyperedge *hyperedge = CreateHyperedge(coordinates);
+ // Add its coordinates to the set of visited coordinates.
+ std::pair<CoordinateSet::iterator, bool> p = m_visited.insert(coordinates);
+ const std::vector<int> &storedCoordinates = *p.first;
+ // Add the SHyperedge to the queue along with its coordinates (which will be
+ // needed for creating its neighbours).
+ m_queue.push(QueueItem(hyperedge, &storedCoordinates));
+}
+
+Cube::~Cube()
+{
+ // Delete the SHyperedges belonging to any unpopped items. Note that the
+ // coordinate vectors are not deleted here since they are owned by m_visited
+ // (and so will be deleted by its destructor).
+ while (!m_queue.empty()) {
+ QueueItem item = m_queue.top();
+ m_queue.pop();
+ // Delete hyperedge and its head (head deletes hyperedge).
+ delete item.first->head; // TODO shared ownership of head vertex?
+ }
+}
+
+SHyperedge *Cube::Pop()
+{
+ QueueItem item = m_queue.top();
+ m_queue.pop();
+ CreateNeighbours(*item.second);
+ return item.first;
+}
+
+void Cube::CreateNeighbours(const std::vector<int> &coordinates)
+{
+ // Create a copy of the origin coordinates that will be adjusted for
+ // each neighbour.
+ std::vector<int> tmpCoordinates(coordinates);
+
+ // Create each neighbour along the vertex stack dimensions.
+ for (std::size_t i = 0; i < coordinates.size()-1; ++i) {
+ int x = coordinates[i];
+ if (m_bundle.stacks[i]->size() > x+1) {
+ ++tmpCoordinates[i];
+ CreateNeighbour(tmpCoordinates);
+ --tmpCoordinates[i];
+ }
+ }
+ // Create the neighbour along the translation dimension.
+ int x = coordinates.back();
+ if (m_bundle.translations->GetSize() > x+1) {
+ ++tmpCoordinates.back();
+ CreateNeighbour(tmpCoordinates);
+ --tmpCoordinates.back();
+ }
+}
+
+void Cube::CreateNeighbour(const std::vector<int> &coordinates)
+{
+ // Add the coordinates to the set of visited coordinates if not already
+ // present.
+ std::pair<CoordinateSet::iterator, bool> p = m_visited.insert(coordinates);
+ if (!p.second) {
+ // We have visited this neighbour before, so there is nothing to do.
+ return;
+ }
+ SHyperedge *hyperedge = CreateHyperedge(coordinates);
+ const std::vector<int> &storedCoordinates = *p.first;
+ m_queue.push(QueueItem(hyperedge, &storedCoordinates));
+}
+
+SHyperedge *Cube::CreateHyperedge(const std::vector<int> &coordinates)
+{
+ SHyperedge *hyperedge = new SHyperedge();
+
+ SVertex *head = new SVertex();
+ head->best = hyperedge;
+ head->pvertex = 0; // FIXME???
+ head->state.resize(
+ StatefulFeatureFunction::GetStatefulFeatureFunctions().size());
+ hyperedge->head = head;
+
+ hyperedge->tail.resize(coordinates.size()-1);
+ for (std::size_t i = 0; i < coordinates.size()-1; ++i) {
+ boost::shared_ptr<SVertex> pred = (*m_bundle.stacks[i])[coordinates[i]];
+ hyperedge->tail[i] = pred.get();
+ if (pred->best) {
+ hyperedge->scoreBreakdown.PlusEquals(pred->best->scoreBreakdown);
+ }
+ }
+ hyperedge->translation = *(m_bundle.translations->begin()+coordinates.back());
+ hyperedge->scoreBreakdown.PlusEquals(hyperedge->translation->GetScoreBreakdown());
+
+ const StaticData &staticData = StaticData::Instance();
+
+ // compute values of stateless feature functions that were not
+ // cached in the translation option-- there is no principled distinction
+ const std::vector<const StatelessFeatureFunction*>& sfs =
+ StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ for (unsigned i = 0; i < sfs.size(); ++i) {
+ if (!staticData.IsFeatureFunctionIgnored(*sfs[i])) {
+ sfs[i]->EvaluateWhenApplied(*hyperedge, &hyperedge->scoreBreakdown);
+ }
+ }
+
+ const std::vector<const StatefulFeatureFunction*>& ffs =
+ StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for (unsigned i = 0; i < ffs.size(); ++i) {
+ if (!staticData.IsFeatureFunctionIgnored(*ffs[i])) {
+ head->state[i] =
+ ffs[i]->EvaluateWhenApplied(*hyperedge, i, &hyperedge->scoreBreakdown);
+ }
+ }
+
+ hyperedge->score = hyperedge->scoreBreakdown.GetWeightedScore();
+
+ return hyperedge;
+}
+
+} // Syntax
+} // Moses
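To make the neighbour generation concrete: for a bundle with two vertex stacks plus the translation dimension, and assuming the items happen to be popped in this order, the expansion proceeds as in the short trace below. The m_visited set guarantees each coordinate vector is created at most once.

    pop (0,0,0) -> push (1,0,0), (0,1,0), (0,0,1)
    pop (1,0,0) -> push (2,0,0), (1,1,0), (1,0,1)
    pop (0,1,0) -> push (0,2,0), (0,1,1)        [(1,1,0) already visited]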
diff --git a/moses/Syntax/Cube.h b/moses/Syntax/Cube.h
new file mode 100644
index 000000000..a28440834
--- /dev/null
+++ b/moses/Syntax/Cube.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <queue>
+#include <vector>
+#include <utility>
+
+#include <boost/unordered_set.hpp>
+
+#include "SHyperedge.h"
+#include "SHyperedgeBundle.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// A cube -- in the cube pruning sense (see Chiang (2007)) -- that lazily
+// produces SHyperedge objects from a SHyperedgeBundle in approximately
+// best-first order.
+class Cube
+{
+ public:
+ Cube(const SHyperedgeBundle &);
+ ~Cube();
+
+ SHyperedge *Pop();
+
+ SHyperedge *Top() const { return m_queue.top().first; }
+
+ bool IsEmpty() const { return m_queue.empty(); }
+
+ private:
+ typedef boost::unordered_set<std::vector<int> > CoordinateSet;
+
+ typedef std::pair<SHyperedge *, const std::vector<int> *> QueueItem;
+
+ class QueueItemOrderer
+ {
+ public:
+ bool operator()(const QueueItem &p, const QueueItem &q) const {
+ return p.first->score < q.first->score;
+ }
+ };
+
+ typedef std::priority_queue<QueueItem, std::vector<QueueItem>,
+ QueueItemOrderer> Queue;
+
+ SHyperedge *CreateHyperedge(const std::vector<int> &);
+ void CreateNeighbour(const std::vector<int> &);
+ void CreateNeighbours(const std::vector<int> &);
+
+ const SHyperedgeBundle &m_bundle;
+ CoordinateSet m_visited;
+ Queue m_queue;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/CubeQueue.cpp b/moses/Syntax/CubeQueue.cpp
new file mode 100644
index 000000000..5bb8c2a98
--- /dev/null
+++ b/moses/Syntax/CubeQueue.cpp
@@ -0,0 +1,37 @@
+#include "CubeQueue.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+CubeQueue::~CubeQueue()
+{
+ while (!m_queue.empty()) {
+ Cube *cube = m_queue.top();
+ m_queue.pop();
+ delete cube;
+ }
+}
+
+SHyperedge *CubeQueue::Pop()
+{
+ // pop the most promising cube
+ Cube *cube = m_queue.top();
+ m_queue.pop();
+
+ // pop the most promising hyperedge from the cube
+ SHyperedge *hyperedge = cube->Pop();
+
+ // if the cube still contains items, push it back onto the queue
+ if (!cube->IsEmpty()) {
+ m_queue.push(cube);
+ } else {
+ delete cube;
+ }
+
+ return hyperedge;
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/CubeQueue.h b/moses/Syntax/CubeQueue.h
new file mode 100644
index 000000000..304e59409
--- /dev/null
+++ b/moses/Syntax/CubeQueue.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <queue>
+#include <vector>
+
+#include "Cube.h"
+#include "SHyperedge.h"
+#include "SHyperedgeBundle.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+class CubeQueue
+{
+ public:
+ template<typename InputIterator>
+ CubeQueue(InputIterator, InputIterator);
+
+ ~CubeQueue();
+
+ SHyperedge *Pop();
+
+ bool IsEmpty() const { return m_queue.empty(); }
+
+ private:
+ class CubeOrderer
+ {
+ public:
+ bool operator()(const Cube *p, const Cube *q) const {
+ return p->Top()->score < q->Top()->score;
+ }
+ };
+
+ typedef std::priority_queue<Cube*, std::vector<Cube*>, CubeOrderer> Queue;
+
+ Queue m_queue;
+};
+
+template<typename InputIterator>
+CubeQueue::CubeQueue(InputIterator first, InputIterator last)
+{
+ while (first != last) {
+ m_queue.push(new Cube(*first++));
+ }
+}
+
+} // Syntax
+} // Moses
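
A minimal usage sketch; it mirrors the cube-pruning loop in Manager-inl.h further down, with bundles and popLimit assumed to come from the surrounding decoder:

    CubeQueue cubeQueue(bundles.Begin(), bundles.End());
    std::size_t count = 0;
    while (count < popLimit && !cubeQueue.IsEmpty()) {
      SHyperedge *hyperedge = cubeQueue.Pop(); // globally most promising edge
      // ... recombine and store the hyperedge ...
      ++count;
    }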
diff --git a/moses/Syntax/KBestExtractor.cpp b/moses/Syntax/KBestExtractor.cpp
new file mode 100644
index 000000000..335d80409
--- /dev/null
+++ b/moses/Syntax/KBestExtractor.cpp
@@ -0,0 +1,317 @@
+#include "KBestExtractor.h"
+
+#include "moses/ScoreComponentCollection.h"
+#include "moses/StaticData.h"
+
+#include <boost/make_shared.hpp>
+#include <boost/scoped_ptr.hpp>
+
+#include <vector>
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// Extract the k-best list from the search graph.
+void KBestExtractor::Extract(
+ const std::vector<boost::shared_ptr<SVertex> > &topLevelVertices,
+ std::size_t k, KBestVec &kBestList)
+{
+ kBestList.clear();
+ if (topLevelVertices.empty()) {
+ return;
+ }
+
+ // Create a new SVertex, supremeVertex, that has the best top-level SVertex as
+ // its predecessor and has the same score.
+ std::vector<boost::shared_ptr<SVertex> >::const_iterator p =
+ topLevelVertices.begin();
+ SVertex &bestTopLevelVertex = **p;
+ boost::scoped_ptr<SVertex> supremeVertex(new SVertex());
+ supremeVertex->pvertex = 0;
+ supremeVertex->best = new SHyperedge();
+ supremeVertex->best->head = supremeVertex.get();
+ supremeVertex->best->tail.push_back(&bestTopLevelVertex);
+ supremeVertex->best->score = bestTopLevelVertex.best->score;
+ supremeVertex->best->scoreBreakdown = bestTopLevelVertex.best->scoreBreakdown;
+ supremeVertex->best->translation = 0;
+
+ // For each alternative top-level SVertex, add a new incoming hyperedge to
+ // supremeVertex.
+ for (++p; p != topLevelVertices.end(); ++p) {
+ // Check that the first item in topLevelVertices really was the best.
+ UTIL_THROW_IF2((*p)->best->score > bestTopLevelVertex.best->score,
+ "top-level SVertices are not correctly sorted");
+ // Note: there's no need for a smart pointer here: supremeVertex will take
+ // ownership of altEdge.
+ SHyperedge *altEdge = new SHyperedge();
+ altEdge->head = supremeVertex.get();
+ altEdge->tail.push_back((*p).get());
+ altEdge->score = (*p)->best->score;
+ altEdge->scoreBreakdown = (*p)->best->scoreBreakdown;
+ altEdge->translation = 0;
+ supremeVertex->recombined.push_back(altEdge);
+ }
+
+ // Create the target vertex then lazily fill its k-best list.
+ boost::shared_ptr<KVertex> targetVertex = FindOrCreateVertex(*supremeVertex);
+ LazyKthBest(targetVertex, k, k);
+
+ // Copy the k-best list from the target vertex, but drop the top edge from
+ // each derivation.
+ kBestList.reserve(targetVertex->kBestList.size());
+ for (std::vector<boost::weak_ptr<Derivation> >::const_iterator
+ q = targetVertex->kBestList.begin();
+ q != targetVertex->kBestList.end(); ++q) {
+ const boost::shared_ptr<Derivation> d(*q);
+ assert(d);
+ assert(d->subderivations.size() == 1);
+ kBestList.push_back(d->subderivations[0]);
+ }
+}
+
+// Generate the target-side yield of the derivation d.
+Phrase KBestExtractor::GetOutputPhrase(const Derivation &d)
+{
+ FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
+
+ Phrase ret(ARRAY_SIZE_INCR);
+
+ const TargetPhrase &phrase = *(d.edge->shyperedge.translation);
+ const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
+ phrase.GetAlignNonTerm().GetNonTermIndexMap();
+ for (std::size_t pos = 0; pos < phrase.GetSize(); ++pos) {
+ const Word &word = phrase.GetWord(pos);
+ if (word.IsNonTerminal()) {
+ std::size_t nonTermInd = nonTermIndexMap[pos];
+ const Derivation &subderivation = *d.subderivations[nonTermInd];
+ Phrase subPhrase = GetOutputPhrase(subderivation);
+ ret.Append(subPhrase);
+ } else {
+ ret.AddWord(word);
+ if (placeholderFactor == NOT_FOUND) {
+ continue;
+ }
+ // FIXME
+ UTIL_THROW2("placeholders are not currently supported by the S2T decoder");
+/*
+ std::set<std::size_t> sourcePosSet =
+ phrase.GetAlignTerm().GetAlignmentsForTarget(pos);
+ if (sourcePosSet.size() == 1) {
+ const std::vector<const Word*> *ruleSourceFromInputPath =
+ hypo.GetTranslationOption().GetSourceRuleFromInputPath();
+ UTIL_THROW_IF2(ruleSourceFromInputPath == NULL,
+ "Source Words in of the rules hasn't been filled out");
+ std::size_t sourcePos = *sourcePosSet.begin();
+ const Word *sourceWord = ruleSourceFromInputPath->at(sourcePos);
+ UTIL_THROW_IF2(sourceWord == NULL,
+ "Null source word at position " << sourcePos);
+ const Factor *factor = sourceWord->GetFactor(placeholderFactor);
+ if (factor) {
+ ret.Back()[0] = factor;
+ }
+ }
+*/
+ }
+ }
+
+ return ret;
+}
+
+// Generate the target tree of the derivation d.
+TreePointer KBestExtractor::GetOutputTree(const Derivation &d)
+{
+ const TargetPhrase &phrase = *(d.edge->shyperedge.translation);
+ if (const PhraseProperty *property = phrase.GetProperty("Tree")) {
+ const std::string *tree = property->GetValueString();
+ TreePointer mytree (boost::make_shared<InternalTree>(*tree));
+
+ //get subtrees (in target order)
+ std::vector<TreePointer> previous_trees;
+ for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
+ const Word &word = phrase.GetWord(pos);
+ if (word.IsNonTerminal()) {
+ size_t nonTermInd = phrase.GetAlignNonTerm().GetNonTermIndexMap()[pos];
+ const Derivation &subderivation = *d.subderivations[nonTermInd];
+ const TreePointer prev_tree = GetOutputTree(subderivation);
+ previous_trees.push_back(prev_tree);
+ }
+ }
+
+ mytree->Combine(previous_trees);
+ return mytree;
+ } else {
+ UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
+ }
+}
+
+// Look for the vertex corresponding to a given SVertex, creating
+// a new one if necessary.
+boost::shared_ptr<KBestExtractor::KVertex>
+KBestExtractor::FindOrCreateVertex(const SVertex &v)
+{
+ // KVertex nodes should not be created for terminal nodes.
+ assert(v.best);
+
+ VertexMap::value_type element(&v, boost::shared_ptr<KVertex>());
+ std::pair<VertexMap::iterator, bool> p = m_vertexMap.insert(element);
+ boost::shared_ptr<KVertex> &sp = p.first->second;
+ if (!p.second) {
+ return sp; // KVertex was already in m_vertexMap.
+ }
+ sp.reset(new KVertex(v));
+ // Create the 1-best derivation and add it to the vertex's kBestList.
+ boost::shared_ptr<KHyperedge> bestEdge(new KHyperedge(*(v.best)));
+ bestEdge->head = sp;
+ std::size_t kTailSize = 0;
+ for (std::size_t i = 0; i < v.best->tail.size(); ++i) {
+ const SVertex *pred = v.best->tail[i];
+ if (pred->best) {
+ ++kTailSize;
+ }
+ }
+ bestEdge->tail.reserve(kTailSize);
+ for (std::size_t i = 0; i < v.best->tail.size(); ++i) {
+ const SVertex *pred = v.best->tail[i];
+ if (pred->best) {
+ bestEdge->tail.push_back(FindOrCreateVertex(*pred));
+ }
+ }
+ boost::shared_ptr<Derivation> bestDerivation(new Derivation(bestEdge));
+#ifndef NDEBUG
+ std::pair<DerivationSet::iterator, bool> q =
+#endif
+ m_derivations.insert(bestDerivation);
+ assert(q.second);
+ sp->kBestList.push_back(bestDerivation);
+ return sp;
+}
+
+// Create the 1-best derivation for each edge in BS(v) (except the best one)
+// and add it to v's candidate queue.
+void KBestExtractor::GetCandidates(boost::shared_ptr<KVertex> v, std::size_t k)
+{
+ // Create 1-best derivations for all of v's incoming edges except the best.
+ // The 1-best derivation for that edge will already have been created.
+ for (std::size_t i = 0; i < v->svertex.recombined.size(); ++i) {
+ const SHyperedge &shyperedge = *(v->svertex.recombined[i]);
+ boost::shared_ptr<KHyperedge> bestEdge(new KHyperedge(shyperedge));
+ bestEdge->head = v;
+ // Count the number of incoming vertices that are not terminals.
+ std::size_t kTailSize = 0;
+ for (std::size_t j = 0; j < shyperedge.tail.size(); ++j) {
+ const SVertex *pred = shyperedge.tail[j];
+ if (pred->best) {
+ ++kTailSize;
+ }
+ }
+ bestEdge->tail.reserve(kTailSize);
+ for (std::size_t j = 0; j < shyperedge.tail.size(); ++j) {
+ const SVertex *pred = shyperedge.tail[j];
+ if (pred->best) {
+ bestEdge->tail.push_back(FindOrCreateVertex(*pred));
+ }
+ }
+ boost::shared_ptr<Derivation> derivation(new Derivation(bestEdge));
+#ifndef NDEBUG
+ std::pair<DerivationSet::iterator, bool> q =
+#endif
+ m_derivations.insert(derivation);
+ assert(q.second);
+ v->candidates.push(derivation);
+ }
+}
+
+// Lazily fill v's k-best list.
+void KBestExtractor::LazyKthBest(boost::shared_ptr<KVertex> v, std::size_t k,
+ std::size_t globalK)
+{
+ // If this is the first visit to vertex v then initialize the priority queue.
+ if (v->visited == false) {
+ // The 1-best derivation should already be in v's k-best list.
+ assert(v->kBestList.size() == 1);
+ // Initialize v's priority queue.
+ GetCandidates(v, globalK);
+ v->visited = true;
+ }
+ // Add derivations to the k-best list until it contains k or there are none
+ // left to add.
+ while (v->kBestList.size() < k) {
+ assert(!v->kBestList.empty());
+ // Update the priority queue by adding the successors of the last
+ // derivation (unless they've been seen before).
+ boost::shared_ptr<Derivation> d(v->kBestList.back());
+ LazyNext(*v, *d, globalK);
+ // Check if there are any derivations left in the queue.
+ if (v->candidates.empty()) {
+ break;
+ }
+ // Get the next best derivation and delete it from the queue.
+ boost::weak_ptr<Derivation> next = v->candidates.top();
+ v->candidates.pop();
+ // Add it to the k-best list.
+ v->kBestList.push_back(next);
+ }
+}
+
+// Create the neighbours of Derivation d and add them to v's candidate queue.
+void KBestExtractor::LazyNext(KVertex &v, const Derivation &d,
+ std::size_t globalK)
+{
+ for (std::size_t i = 0; i < d.edge->tail.size(); ++i) {
+ boost::shared_ptr<KVertex> pred = d.edge->tail[i];
+ // Ensure that pred's k-best list contains enough derivations.
+ std::size_t k = d.backPointers[i] + 2;
+ LazyKthBest(pred, k, globalK);
+ if (pred->kBestList.size() < k) {
+ // pred's derivations have been exhausted.
+ continue;
+ }
+ // Create the neighbour.
+ boost::shared_ptr<Derivation> next(new Derivation(d, i));
+ // Check if it has been created before.
+ std::pair<DerivationSet::iterator, bool> p = m_derivations.insert(next);
+ if (p.second) {
+ v.candidates.push(next); // Haven't previously seen it.
+ }
+ }
+}
+
+// Construct the 1-best Derivation that ends at edge e.
+KBestExtractor::Derivation::Derivation(const boost::shared_ptr<KHyperedge> &e)
+{
+ edge = e;
+ std::size_t arity = edge->tail.size();
+ backPointers.resize(arity, 0);
+ subderivations.reserve(arity);
+ for (std::size_t i = 0; i < arity; ++i) {
+ const KVertex &pred = *(edge->tail[i]);
+ assert(pred.kBestList.size() >= 1);
+ boost::shared_ptr<Derivation> sub(pred.kBestList[0]);
+ subderivations.push_back(sub);
+ }
+ score = edge->shyperedge.score;
+ scoreBreakdown = edge->shyperedge.scoreBreakdown;
+}
+
+// Construct a Derivation that neighbours an existing Derivation.
+KBestExtractor::Derivation::Derivation(const Derivation &d, std::size_t i)
+{
+ edge = d.edge;
+ backPointers = d.backPointers;
+ subderivations = d.subderivations;
+ std::size_t j = ++backPointers[i];
+ scoreBreakdown = d.scoreBreakdown;
+ // Deduct the score of the old subderivation.
+ scoreBreakdown.MinusEquals(subderivations[i]->scoreBreakdown);
+ // Update the subderivation pointer.
+ boost::shared_ptr<Derivation> newSub(edge->tail[i]->kBestList[j]);
+ subderivations[i] = newSub;
+ // Add the score of the new subderivation.
+ scoreBreakdown.PlusEquals(subderivations[i]->scoreBreakdown);
+ score = scoreBreakdown.GetWeightedScore();
+}
+
+} // namespace Syntax
+} // namespace Moses
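
The neighbour rule implemented by the second Derivation constructor (advance one backpointer, swap the corresponding subderivation, adjust the score by the difference) can be checked numerically with this self-contained toy; all scores are invented:

    #include <cstdio>
    #include <vector>

    int main() {
      // k-best score lists for two tail vertices, best first.
      const double kbest[2][3] = { {-1.0, -1.8, -3.0}, {-0.4, -0.9, -2.0} };
      const double edgeScore = -0.2;  // local score of the hyperedge itself

      std::vector<int> bp(2, 0);      // backPointers of the 1-best derivation
      const double best = edgeScore + kbest[0][bp[0]] + kbest[1][bp[1]];
      std::printf("1-best: %.2f\n", best);  // -1.60

      for (int i = 0; i < 2; ++i) {   // each neighbour advances one backpointer
        std::vector<int> nb(bp);
        ++nb[i];
        // Same arithmetic as the MinusEquals/PlusEquals pair above.
        const double score = best - kbest[i][bp[i]] + kbest[i][nb[i]];
        std::printf("neighbour via tail %d: %.2f\n", i, score);  // -2.40, -2.10
      }
      return 0;
    }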
diff --git a/moses/Syntax/KBestExtractor.h b/moses/Syntax/KBestExtractor.h
new file mode 100644
index 000000000..21fb6f737
--- /dev/null
+++ b/moses/Syntax/KBestExtractor.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include <cassert>
+
+#include <queue>
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include <boost/weak_ptr.hpp>
+
+#include "moses/ScoreComponentCollection.h"
+#include "moses/FF/InternalTree.h"
+
+#include "SHyperedge.h"
+#include "SVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// k-best list extractor that implements algorithm 3 from this paper:
+//
+// Liang Huang and David Chiang
+// "Better k-best parsing"
+// In Proceedings of IWPT 2005
+//
+class KBestExtractor
+{
+ public:
+ struct KVertex;
+
+ struct KHyperedge {
+ KHyperedge(const SHyperedge &e) : shyperedge(e) {}
+
+ const SHyperedge &shyperedge;
+ boost::shared_ptr<KVertex> head;
+ std::vector<boost::shared_ptr<KVertex> > tail;
+ };
+
+ struct Derivation {
+ Derivation(const boost::shared_ptr<KHyperedge> &);
+ Derivation(const Derivation &, std::size_t);
+
+ boost::shared_ptr<KHyperedge> edge;
+ std::vector<std::size_t> backPointers;
+ std::vector<boost::shared_ptr<Derivation> > subderivations;
+ ScoreComponentCollection scoreBreakdown;
+ float score;
+ };
+
+ struct DerivationOrderer {
+ bool operator()(const boost::weak_ptr<Derivation> &d1,
+ const boost::weak_ptr<Derivation> &d2) const {
+ boost::shared_ptr<Derivation> s1(d1);
+ boost::shared_ptr<Derivation> s2(d2);
+ return s1->score < s2->score;
+ }
+ };
+
+ struct KVertex {
+ typedef std::priority_queue<boost::weak_ptr<Derivation>,
+ std::vector<boost::weak_ptr<Derivation> >,
+ DerivationOrderer> DerivationQueue;
+
+ KVertex(const SVertex &v) : svertex(v), visited(false) {}
+
+ const SVertex &svertex;
+ std::vector<boost::weak_ptr<Derivation> > kBestList;
+ DerivationQueue candidates;
+ bool visited;
+ };
+
+ typedef std::vector<boost::shared_ptr<Derivation> > KBestVec;
+
+ // Extract the k-best list from the search hypergraph given the full, sorted
+ // list of top-level SVertices.
+ void Extract(const std::vector<boost::shared_ptr<SVertex> > &, std::size_t,
+ KBestVec &);
+
+ static Phrase GetOutputPhrase(const Derivation &);
+ static TreePointer GetOutputTree(const Derivation &);
+
+ private:
+ typedef boost::unordered_map<const SVertex *,
+ boost::shared_ptr<KVertex> > VertexMap;
+
+ struct DerivationHasher {
+ std::size_t operator()(const boost::shared_ptr<Derivation> &d) const {
+ std::size_t seed = 0;
+ boost::hash_combine(seed, &(d->edge->shyperedge));
+ boost::hash_combine(seed, d->backPointers);
+ return seed;
+ }
+ };
+
+ struct DerivationEqualityPred {
+ bool operator()(const boost::shared_ptr<Derivation> &d1,
+ const boost::shared_ptr<Derivation> &d2) const {
+ return &(d1->edge->shyperedge) == &(d2->edge->shyperedge) &&
+ d1->backPointers == d2->backPointers;
+ }
+ };
+
+ typedef boost::unordered_set<boost::shared_ptr<Derivation>, DerivationHasher,
+ DerivationEqualityPred> DerivationSet;
+
+ boost::shared_ptr<KVertex> FindOrCreateVertex(const SVertex &);
+ void GetCandidates(boost::shared_ptr<KVertex>, std::size_t);
+ void LazyKthBest(boost::shared_ptr<KVertex>, std::size_t, std::size_t);
+ void LazyNext(KVertex &, const Derivation &, std::size_t);
+
+ VertexMap m_vertexMap;
+ DerivationSet m_derivations;
+};
+
+} // namespace Syntax
+} // namespace Moses
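
A note on ownership in this header: m_derivations holds the owning shared_ptrs, while kBestList and candidates hold weak_ptrs, presumably to avoid reference cycles (a derivation reaches vertices through its edge, and those vertices in turn list derivations). Promoting a weak_ptr is therefore expected to succeed for as long as the extractor is alive, which is what Extract relies on. A sketch, with vertex assumed to be a populated KVertex:

    boost::weak_ptr<KBestExtractor::Derivation> w = vertex->kBestList.front();
    boost::shared_ptr<KBestExtractor::Derivation> d(w); // throws bad_weak_ptr if expired
    assert(d);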
diff --git a/moses/Syntax/NonTerminalMap.h b/moses/Syntax/NonTerminalMap.h
new file mode 100644
index 000000000..ff7ce2508
--- /dev/null
+++ b/moses/Syntax/NonTerminalMap.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+
+#include "SymbolEqualityPred.h"
+#include "SymbolHasher.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// Hybrid map/vector-based container for key-value pairs where the key is a
+// non-terminal Word. The interface is like a (stripped-down) map type, with
+// the main differences being that:
+// 1. Find() is implemented using vector indexing to make it fast.
+// 2. Once a value has been inserted it can be modified but can't be removed.
+template<typename T>
+class NonTerminalMap
+{
+ private:
+ typedef boost::unordered_map<Word, T, SymbolHasher, SymbolEqualityPred> Map;
+ typedef std::vector<T*> Vec;
+
+ public:
+ typedef typename Map::iterator Iterator;
+ typedef typename Map::const_iterator ConstIterator;
+
+ NonTerminalMap()
+ : m_vec(FactorCollection::Instance().GetNumNonTerminals(), NULL) {}
+
+ Iterator Begin() { return m_map.begin(); }
+ Iterator End() { return m_map.end(); }
+
+ ConstIterator Begin() const { return m_map.begin(); }
+ ConstIterator End() const { return m_map.end(); }
+
+ std::size_t Size() const { return m_map.size(); }
+
+ bool IsEmpty() const { return m_map.empty(); }
+
+ std::pair<Iterator, bool> Insert(const Word &, const T &);
+
+ T *Find(const Word &w) const { return m_vec[w[0]->GetId()]; }
+
+ private:
+ Map m_map;
+ Vec m_vec;
+};
+
+template<typename T>
+std::pair<typename NonTerminalMap<T>::Iterator, bool> NonTerminalMap<T>::Insert(
+ const Word &key, const T &value)
+{
+ std::pair<typename Map::iterator, bool> result =
+ m_map.insert(typename Map::value_type(key, value));
+ if (result.second) {
+ T *p = &(result.first->second);
+ std::size_t i = key[0]->GetId();
+ m_vec[i] = p;
+ }
+ return result;
+}
+
+} // namespace Syntax
+} // namespace Moses
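
A hypothetical usage sketch; ntWord is assumed to be a non-terminal Word whose factor is already interned in FactorCollection:

    NonTerminalMap<int> counts;
    counts.Insert(ntWord, 0);           // no effect if ntWord is already present
    if (int *p = counts.Find(ntWord)) { // plain vector index on the factor id
      ++(*p);                           // values can be modified in place,
    }                                   // but never removed (see note above)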
diff --git a/moses/Syntax/PHyperedge.h b/moses/Syntax/PHyperedge.h
new file mode 100644
index 000000000..8f236fcb8
--- /dev/null
+++ b/moses/Syntax/PHyperedge.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/TargetPhraseCollection.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex;
+
+struct PHyperedge
+{
+ PVertex *head;
+ std::vector<PVertex*> tail;
+ const TargetPhraseCollection *translations;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/PVertex.h b/moses/Syntax/PVertex.h
new file mode 100644
index 000000000..d82309c82
--- /dev/null
+++ b/moses/Syntax/PVertex.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "moses/Word.h"
+#include "moses/WordsRange.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex
+{
+ public:
+ PVertex(const WordsRange &wr, const Word &w) : span(wr), symbol(w) {}
+
+ WordsRange span;
+ Word symbol;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/RuleTable.h b/moses/Syntax/RuleTable.h
new file mode 100644
index 000000000..90a25d63c
--- /dev/null
+++ b/moses/Syntax/RuleTable.h
@@ -0,0 +1,24 @@
+#pragma once
+
+namespace Moses
+{
+namespace Syntax
+{
+
+class RuleTableFF;
+
+// Base class for any data structure representing a synchronous
+// grammar, like a trie (for S2T) or a DFA (for T2S).
+class RuleTable
+{
+ public:
+ RuleTable(const RuleTableFF *ff) : m_ff(ff) {}
+
+ virtual ~RuleTable() {}
+
+ protected:
+ const RuleTableFF *m_ff;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/RuleTableFF.cpp b/moses/Syntax/RuleTableFF.cpp
new file mode 100644
index 000000000..771c3983c
--- /dev/null
+++ b/moses/Syntax/RuleTableFF.cpp
@@ -0,0 +1,51 @@
+#include "RuleTableFF.h"
+
+#include "moses/StaticData.h"
+#include "moses/Syntax/S2T/RuleTrieCYKPlus.h"
+#include "moses/Syntax/S2T/RuleTrieLoader.h"
+#include "moses/Syntax/S2T/RuleTrieScope3.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+std::vector<RuleTableFF*> RuleTableFF::s_instances;
+
+RuleTableFF::RuleTableFF(const std::string &line)
+ : PhraseDictionary(line)
+{
+ ReadParameters();
+ // Caching is pointless for an in-memory phrase table.
+ m_maxCacheSize = 0;
+
+ s_instances.push_back(this);
+}
+
+void RuleTableFF::Load()
+{
+ SetFeaturesToApply();
+
+ const StaticData &staticData = StaticData::Instance();
+ if (!staticData.UseS2TDecoder()) {
+ UTIL_THROW2("ERROR: RuleTableFF currently only supports S2T decoder");
+ } else {
+ S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
+ if (algorithm == RecursiveCYKPlus) {
+ S2T::RuleTrieCYKPlus *trie = new S2T::RuleTrieCYKPlus(this);
+ S2T::RuleTrieLoader loader;
+ loader.Load(m_input, m_output, m_filePath, *this, *trie);
+ m_table = trie;
+ } else if (algorithm == Scope3) {
+ S2T::RuleTrieScope3 *trie = new S2T::RuleTrieScope3(this);
+ S2T::RuleTrieLoader loader;
+ loader.Load(m_input, m_output, m_filePath, *this, *trie);
+ m_table = trie;
+ } else {
+ UTIL_THROW2("ERROR: unhandled S2T parsing algorithm");
+ }
+ }
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/RuleTableFF.h b/moses/Syntax/RuleTableFF.h
new file mode 100644
index 000000000..0e6040612
--- /dev/null
+++ b/moses/Syntax/RuleTableFF.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <string>
+
+#include "moses/TranslationModel/PhraseDictionary.h"
+
+namespace Moses
+{
+
+class ChartParser;
+class ChartCellCollectionBase;
+
+namespace Syntax
+{
+
+class RuleTable;
+
+// Feature function for dealing with local rule scores (that come from a
+// rule table). The scores themselves are stored on TargetPhrase objects
+// and the decoder accesses them directly, so this object doesn't really do
+// anything except provide somewhere to store the weights and parameter values.
+class RuleTableFF : public PhraseDictionary
+{
+ public:
+ RuleTableFF(const std::string &);
+
+ // FIXME Delete m_table?
+ ~RuleTableFF() {}
+
+ void Load();
+
+ const RuleTable *GetTable() const { return m_table; }
+
+ static const std::vector<RuleTableFF*> &Instances() { return s_instances; }
+
+ ChartRuleLookupManager *CreateRuleLookupManager(
+ const ChartParser &, const ChartCellCollectionBase &, std::size_t)
+ {
+ assert(false);
+ return 0;
+ }
+
+ private:
+ static std::vector<RuleTableFF*> s_instances;
+
+ const RuleTable *m_table;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/DerivationWriter.cpp b/moses/Syntax/S2T/DerivationWriter.cpp
new file mode 100644
index 000000000..dcb98b3c6
--- /dev/null
+++ b/moses/Syntax/S2T/DerivationWriter.cpp
@@ -0,0 +1,100 @@
+#include "DerivationWriter.h"
+
+#include "moses/Factor.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SHyperedge.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// 1-best version.
+void DerivationWriter::Write(const SHyperedge &shyperedge,
+ std::size_t sentNum, std::ostream &out)
+{
+ WriteLine(shyperedge, sentNum, out);
+ for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) {
+ const SVertex &pred = *(shyperedge.tail[i]);
+ if (pred.best) {
+ Write(*pred.best, sentNum, out);
+ }
+ }
+}
+
+// k-best derivation.
+void DerivationWriter::Write(const KBestExtractor::Derivation &derivation,
+ std::size_t sentNum, std::ostream &out)
+{
+ WriteLine(derivation.edge->shyperedge, sentNum, out);
+ for (std::size_t i = 0; i < derivation.subderivations.size(); ++i) {
+ Write(*(derivation.subderivations[i]), sentNum, out);
+ }
+}
+
+void DerivationWriter::WriteLine(const SHyperedge &shyperedge,
+ std::size_t sentNum, std::ostream &out)
+{
+ // Sentence number.
+ out << sentNum << " |||";
+
+ // Source LHS.
+ out << " [X] ->";
+
+ // Source RHS symbols.
+ for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) {
+ const Word &symbol = shyperedge.tail[i]->pvertex->symbol;
+ out << " ";
+ if (symbol.IsNonTerminal()) {
+ out << "[X]";
+ } else {
+ WriteSymbol(symbol, out);
+ }
+ }
+ out << " |||";
+
+ // Target RHS.
+ out << " ";
+ WriteSymbol(shyperedge.head->pvertex->symbol, out);
+ out << " ->";
+
+ // Target RHS symbols.
+ const TargetPhrase &phrase = *(shyperedge.translation);
+ for (std::size_t i = 0; i < phrase.GetSize(); ++i) {
+ out << " ";
+ WriteSymbol(phrase.GetWord(i), out);
+ }
+ out << " |||";
+
+ // Non-terminal alignments
+ const AlignmentInfo &a = phrase.GetAlignNonTerm();
+ for (AlignmentInfo::const_iterator p = a.begin(); p != a.end(); ++p) {
+ out << " " << p->first << "-" << p->second;
+ }
+ out << " |||";
+
+ // Spans covered by source RHS symbols.
+ for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) {
+ const SVertex *child = shyperedge.tail[i];
+ const WordsRange &span = child->pvertex->span;
+ out << " " << span.GetStartPos() << ".." << span.GetEndPos();
+ }
+
+ out << "\n";
+}
+
+void DerivationWriter::WriteSymbol(const Word &symbol, std::ostream &out)
+{
+ const Factor *f = symbol[0];
+ if (symbol.IsNonTerminal()) {
+ out << "[" << f->GetString() << "]";
+ } else {
+ out << f->GetString();
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
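
For illustration, WriteLine emits one line per rule application. With an invented sentence and rule, a line would look like:

    0 ||| [X] -> er [X] ||| [VP] -> he [VP] ||| 1-1 ||| 0..0 1..3

that is: sentence number, source RHS (non-terminals written as [X]), target LHS and RHS, the non-terminal alignment, and the source span covered by each source RHS symbol.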
diff --git a/moses/Syntax/S2T/DerivationWriter.h b/moses/Syntax/S2T/DerivationWriter.h
new file mode 100644
index 000000000..706490ce0
--- /dev/null
+++ b/moses/Syntax/S2T/DerivationWriter.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <ostream>
+
+#include "moses/Syntax/KBestExtractor.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+struct SHyperedge;
+
+namespace S2T
+{
+
+// Writes a string representation of a derivation to a std::ostream. This is
+// used by the -translation-details / -T option.
+// TODO DerivationWriter currently assumes string-to-tree (which is why it's
+// TODO in the S2T namespace) but it would be easy to generalise it. This
+// TODO should be revisited when the other decoders are implemented.
+class DerivationWriter
+{
+ public:
+ // 1-best version.
+ static void Write(const SHyperedge&, std::size_t, std::ostream &);
+
+ // k-best version.
+ static void Write(const KBestExtractor::Derivation &, std::size_t,
+ std::ostream &);
+ private:
+ static void WriteLine(const SHyperedge &, std::size_t, std::ostream &);
+ static void WriteSymbol(const Word &, std::ostream &);
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Manager-inl.h b/moses/Syntax/S2T/Manager-inl.h
new file mode 100644
index 000000000..a08c320f0
--- /dev/null
+++ b/moses/Syntax/S2T/Manager-inl.h
@@ -0,0 +1,387 @@
+#pragma once
+
+#include <algorithm>
+#include <set>
+
+#include "moses/DecodeGraph.h"
+#include "moses/StaticData.h"
+#include "moses/Syntax/BoundedPriorityContainer.h"
+#include "moses/Syntax/CubeQueue.h"
+#include "moses/Syntax/PHyperedge.h"
+#include "moses/Syntax/RuleTable.h"
+#include "moses/Syntax/RuleTableFF.h"
+#include "moses/Syntax/SHyperedgeBundle.h"
+#include "moses/Syntax/SVertex.h"
+#include "moses/Syntax/SVertexRecombinationOrderer.h"
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+
+#include "OovHandler.h"
+#include "PChart.h"
+#include "RuleTrie.h"
+#include "SChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename Parser>
+Manager<Parser>::Manager(const InputType &source)
+ : m_source(source)
+ , m_pchart(source.GetSize(), Parser::RequiresCompressedChart())
+ , m_schart(source.GetSize())
+{
+}
+
+template<typename Parser>
+void Manager<Parser>::InitializeCharts()
+{
+ // Create a PVertex object and a SVertex object for each source word.
+ for (std::size_t i = 0; i < m_source.GetSize(); ++i) {
+ const Word &terminal = m_source.GetWord(i);
+
+ // PVertex
+ PVertex tmp(WordsRange(i,i), m_source.GetWord(i));
+ PVertex &pvertex = m_pchart.AddVertex(tmp);
+
+ // SVertex
+ boost::shared_ptr<SVertex> v(new SVertex());
+ v->best = 0;
+ v->pvertex = &pvertex;
+ SChart::Cell &scell = m_schart.GetCell(i,i);
+ SVertexStack stack(1, v);
+ SChart::Cell::TMap::value_type x(terminal, stack);
+ scell.terminalStacks.insert(x);
+ }
+}
+
+template<typename Parser>
+void Manager<Parser>::InitializeParsers(PChart &pchart,
+ std::size_t ruleLimit)
+{
+ const std::vector<RuleTableFF*> &ffs = RuleTableFF::Instances();
+
+ const std::vector<DecodeGraph*> &graphs =
+ StaticData::Instance().GetDecodeGraphs();
+
+ UTIL_THROW_IF2(ffs.size() != graphs.size(),
+ "number of RuleTables does not match number of decode graphs");
+
+ for (std::size_t i = 0; i < ffs.size(); ++i) {
+ RuleTableFF *ff = ffs[i];
+ std::size_t maxChartSpan = graphs[i]->GetMaxChartSpan();
+ // This may change in the future, but currently we assume that every
+ // RuleTableFF is associated with a static, file-based rule table of
+ // some sort and that the table should have been loaded into a RuleTable
+ // by this point.
+ const RuleTable *table = ff->GetTable();
+ assert(table);
+ RuleTable *nonConstTable = const_cast<RuleTable*>(table);
+ boost::shared_ptr<Parser> parser;
+ typename Parser::RuleTrie *trie =
+ dynamic_cast<typename Parser::RuleTrie*>(nonConstTable);
+ assert(trie);
+ parser.reset(new Parser(pchart, *trie, maxChartSpan));
+ m_parsers.push_back(parser);
+ }
+
+ // Check for OOVs and synthesize an additional rule trie + parser if
+ // necessary.
+ m_oovs.clear();
+ std::size_t maxOovWidth = 0;
+ FindOovs(pchart, m_oovs, maxOovWidth);
+ if (!m_oovs.empty()) {
+ // FIXME Add a hidden RuleTableFF for unknown words(?)
+ OovHandler<typename Parser::RuleTrie> oovHandler(*ffs[0]);
+ m_oovRuleTrie = oovHandler.SynthesizeRuleTrie(m_oovs.begin(), m_oovs.end());
+ // Create a parser for the OOV rule trie.
+ boost::shared_ptr<Parser> parser(
+ new Parser(pchart, *m_oovRuleTrie, maxOovWidth));
+ m_parsers.push_back(parser);
+ }
+}
+
+// Find the set of OOVs for this input and record the width of the widest one
+// in maxOovWidth (passed by reference so that the caller receives the value).
+// This function assumes that the PChart argument has already been initialized
+// from the input.
+template<typename Parser>
+void Manager<Parser>::FindOovs(const PChart &pchart, std::set<Word> &oovs,
+ std::size_t &maxOovWidth)
+{
+ // Get the set of RuleTries.
+ std::vector<const RuleTrie *> tries;
+ const std::vector<RuleTableFF*> &ffs = RuleTableFF::Instances();
+ for (std::size_t i = 0; i < ffs.size(); ++i) {
+ const RuleTableFF *ff = ffs[i];
+ if (ff->GetTable()) {
+ const RuleTrie *trie = dynamic_cast<const RuleTrie*>(ff->GetTable());
+ assert(trie); // FIXME
+ tries.push_back(trie);
+ }
+ }
+
+ // For every sink vertex in pchart (except for <s> and </s>), check whether
+ // the word has a preterminal rule in any of the rule tables. If not then
+ // add it to the OOV set.
+ oovs.clear();
+ maxOovWidth = 0;
+ // Assume <s> and </s> have been added at sentence boundaries, so skip
+ // cells starting at position 0 and ending at the last position.
+ for (std::size_t i = 1; i < pchart.GetWidth()-1; ++i) {
+ for (std::size_t j = i; j < pchart.GetWidth()-1; ++j) {
+ std::size_t width = j-i+1;
+ const PChart::Cell::TMap &map = pchart.GetCell(i,j).terminalVertices;
+ for (PChart::Cell::TMap::const_iterator p = map.begin();
+ p != map.end(); ++p) {
+ const Word &word = p->first;
+ assert(!word.IsNonTerminal());
+ bool found = false;
+ for (std::vector<const RuleTrie *>::const_iterator q = tries.begin();
+ q != tries.end(); ++q) {
+ const RuleTrie *trie = *q;
+ if (trie->HasPreterminalRule(word)) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ oovs.insert(word);
+ maxOovWidth = std::max(maxOovWidth, width);
+ }
+ }
+ }
+ }
+}
+
+template<typename Parser>
+void Manager<Parser>::Decode()
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ // Get various pruning-related constants.
+ const std::size_t popLimit = staticData.GetCubePruningPopLimit();
+ const std::size_t ruleLimit = staticData.GetRuleLimit();
+ const std::size_t stackLimit = staticData.GetMaxHypoStackSize();
+
+ // Initialise the PChart and SChart.
+ InitializeCharts();
+
+ // Initialize the parsers.
+ InitializeParsers(m_pchart, ruleLimit);
+
+ // Create a callback to process the PHyperedges produced by the parsers.
+ typename Parser::CallbackType callback(m_schart, ruleLimit);
+
+ // Visit each cell of PChart in right-to-left depth-first order.
+ std::size_t size = m_source.GetSize();
+ for (int start = size-1; start >= 0; --start) {
+ for (std::size_t width = 1; width <= size-start; ++width) {
+ std::size_t end = start + width - 1;
+
+ //PChart::Cell &pcell = m_pchart.GetCell(start, end);
+ SChart::Cell &scell = m_schart.GetCell(start, end);
+
+ WordsRange range(start, end);
+
+ // Call the parsers to generate PHyperedges for this span and convert
+ // each one to a SHyperedgeBundle (via the callback). The callback
+ // prunes the SHyperedgeBundles and keeps the best ones (up to ruleLimit).
+ callback.InitForRange(range);
+ for (typename std::vector<boost::shared_ptr<Parser> >::iterator
+ p = m_parsers.begin(); p != m_parsers.end(); ++p) {
+ (*p)->EnumerateHyperedges(range, callback);
+ }
+
+ // Retrieve the (pruned) set of SHyperedgeBundles from the callback.
+ const BoundedPriorityContainer<SHyperedgeBundle> &bundles =
+ callback.GetContainer();
+
+ // Use cube pruning to extract SHyperedges from SHyperedgeBundles.
+ // Collect the SHyperedges into buffers, one for each category.
+ CubeQueue cubeQueue(bundles.Begin(), bundles.End());
+ std::size_t count = 0;
+ typedef boost::unordered_map<Word, std::vector<SHyperedge*>,
+ SymbolHasher, SymbolEqualityPred > BufferMap;
+ BufferMap buffers;
+ while (count < popLimit && !cubeQueue.IsEmpty()) {
+ SHyperedge *hyperedge = cubeQueue.Pop();
+ // BEGIN{HACK}
+ // The way things currently work, the LHS of each hyperedge is not
+ // determined until just before the point of its creation, when a
+ // target phrase is selected from the list of possible phrases (which
+ // happens during cube pruning). The cube pruning code doesn't (and
+ // shouldn't) know about the contents of PChart and so creation of
+ // the PVertex is deferred until this point.
+ const Word &lhs = hyperedge->translation->GetTargetLHS();
+ hyperedge->head->pvertex = &m_pchart.AddVertex(PVertex(range, lhs));
+ // END{HACK}
+ buffers[lhs].push_back(hyperedge);
+ ++count;
+ }
+
+ // Recombine SVertices and sort into stacks.
+ for (BufferMap::const_iterator p = buffers.begin(); p != buffers.end();
+ ++p) {
+ const Word &category = p->first;
+ const std::vector<SHyperedge*> &buffer = p->second;
+ std::pair<SChart::Cell::NMap::Iterator, bool> ret =
+ scell.nonTerminalStacks.Insert(category, SVertexStack());
+ assert(ret.second);
+ SVertexStack &stack = ret.first->second;
+ RecombineAndSort(buffer, stack);
+ }
+
+ // Prune stacks.
+ if (stackLimit > 0) {
+ for (SChart::Cell::NMap::Iterator p = scell.nonTerminalStacks.Begin();
+ p != scell.nonTerminalStacks.End(); ++p) {
+ SVertexStack &stack = p->second;
+ if (stack.size() > stackLimit) {
+ stack.resize(stackLimit);
+ }
+ }
+ }
+
+ // Prune the PChart cell for this span by removing vertices for
+ // categories that don't occur in the SChart.
+// Note: see HACK above. Pruning the chart isn't currently necessary.
+// PrunePChart(scell, pcell);
+ }
+ }
+}
+
+template<typename Parser>
+const SHyperedge *Manager<Parser>::GetBestSHyperedge() const
+{
+ const SChart::Cell &cell = m_schart.GetCell(0, m_source.GetSize()-1);
+ const SChart::Cell::NMap &stacks = cell.nonTerminalStacks;
+ if (stacks.Size() == 0) {
+ return 0;
+ }
+ assert(stacks.Size() == 1);
+ const std::vector<boost::shared_ptr<SVertex> > &stack = stacks.Begin()->second;
+ return stack[0]->best;
+}
+
+template<typename Parser>
+void Manager<Parser>::ExtractKBest(
+ std::size_t k,
+ std::vector<boost::shared_ptr<KBestExtractor::Derivation> > &kBestList,
+ bool onlyDistinct) const
+{
+ kBestList.clear();
+ if (k == 0 || m_source.GetSize() == 0) {
+ return;
+ }
+
+ // Get the top-level SVertex stack.
+ const SChart::Cell &cell = m_schart.GetCell(0, m_source.GetSize()-1);
+ const SChart::Cell::NMap &stacks = cell.nonTerminalStacks;
+ if (stacks.Size() == 0) {
+ return;
+ }
+ assert(stacks.Size() == 1);
+ const std::vector<boost::shared_ptr<SVertex> > &stack = stacks.Begin()->second;
+
+ KBestExtractor extractor;
+
+ if (!onlyDistinct) {
+ // Return the k-best list as is, including duplicate translations.
+ extractor.Extract(stack, k, kBestList);
+ return;
+ }
+
+ // Determine how many derivations to extract. If the k-best list is
+ // restricted to distinct translations then this limit should be bigger
+ // than k. The k-best factor determines how much bigger the limit should be,
+ // with 0 being 'unlimited.' This actually sets a large-ish limit in case
+ // too many translations are identical.
+ const StaticData &staticData = StaticData::Instance();
+ const std::size_t nBestFactor = staticData.GetNBestFactor();
+ std::size_t numDerivations = (nBestFactor == 0) ? k*1000 : k*nBestFactor;
+
+ // Extract the derivations.
+ KBestExtractor::KBestVec bigList;
+ bigList.reserve(numDerivations);
+ extractor.Extract(stack, numDerivations, bigList);
+
+ // Copy derivations into kBestList, skipping ones with repeated translations.
+ std::set<Phrase> distinct;
+ for (KBestExtractor::KBestVec::const_iterator p = bigList.begin();
+ kBestList.size() < k && p != bigList.end(); ++p) {
+ boost::shared_ptr<KBestExtractor::Derivation> derivation = *p;
+ Phrase translation = KBestExtractor::GetOutputPhrase(*derivation);
+ if (distinct.insert(translation).second) {
+ kBestList.push_back(derivation);
+ }
+ }
+}
+
+template<typename Parser>
+void Manager<Parser>::PrunePChart(const SChart::Cell &scell,
+ PChart::Cell &pcell)
+{
+/* FIXME
+ PChart::Cell::VertexMap::iterator p = pcell.vertices.begin();
+ while (p != pcell.vertices.end()) {
+ const Word &category = p->first;
+ if (scell.stacks.find(category) == scell.stacks.end()) {
+ PChart::Cell::VertexMap::iterator q = p++;
+ pcell.vertices.erase(q);
+ } else {
+ ++p;
+ }
+ }
+*/
+}
+
+template<typename Parser>
+void Manager<Parser>::RecombineAndSort(const std::vector<SHyperedge*> &buffer,
+ SVertexStack &stack)
+{
+ // Step 1: Create a map containing a single instance of each distinct vertex
+ // (where distinctness is defined by the state value). The hyperedges'
+ // head pointers are updated to point to the vertex instances in the map and
+ // any 'duplicate' vertices are deleted.
+// TODO Set?
+ typedef std::map<SVertex *, SVertex *, SVertexRecombinationOrderer> Map;
+ Map map;
+ for (std::vector<SHyperedge*>::const_iterator p = buffer.begin();
+ p != buffer.end(); ++p) {
+ SHyperedge *h = *p;
+ SVertex *v = h->head;
+ assert(v->best == h);
+ assert(v->recombined.empty());
+ std::pair<Map::iterator, bool> result = map.insert(Map::value_type(v, v));
+ if (result.second) {
+ continue; // v's recombination value hasn't been seen before.
+ }
+ // v is a duplicate (according to the recombination rules).
+ // Compare the score of h against the score of the best incoming hyperedge
+ // for the stored vertex.
+ SVertex *storedVertex = result.first->second;
+ if (h->score > storedVertex->best->score) {
+ // h's score is better.
+ storedVertex->recombined.push_back(storedVertex->best);
+ storedVertex->best = h;
+ } else {
+ storedVertex->recombined.push_back(h);
+ }
+ h->head->best = 0;
+ delete h->head;
+ h->head = storedVertex;
+ }
+
+ // Step 2: Copy the vertices from the map to the stack.
+ stack.clear();
+ stack.reserve(map.size());
+ for (Map::const_iterator p = map.begin(); p != map.end(); ++p) {
+ stack.push_back(boost::shared_ptr<SVertex>(p->first));
+ }
+
+ // Step 3: Sort the vertices in the stack.
+ std::sort(stack.begin(), stack.end(), SVertexStackContentOrderer());
+}
+
+} // S2T
+} // Syntax
+} // Moses
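
A hypothetical end-to-end driver. The recursive CYK+ parser defined further down reports end positions through a two-argument callback, so it is assumed here to pair with EagerParserCallback:

    typedef RecursiveCYKPlusParser<EagerParserCallback> Parser;

    Manager<Parser> manager(source);  // source: the input sentence (InputType)
    manager.Decode();

    const SHyperedge *best = manager.GetBestSHyperedge();
    if (best != NULL) {
      // A derivation was found; extract a distinct 100-best list from it.
      std::vector<boost::shared_ptr<KBestExtractor::Derivation> > kBest;
      manager.ExtractKBest(100, kBest, /*onlyDistinct=*/true);
    }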
diff --git a/moses/Syntax/S2T/Manager.h b/moses/Syntax/S2T/Manager.h
new file mode 100644
index 000000000..f6bea903d
--- /dev/null
+++ b/moses/Syntax/S2T/Manager.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+
+#include "moses/InputType.h"
+#include "moses/Syntax/KBestExtractor.h"
+#include "moses/Syntax/SVertexStack.h"
+
+#include "OovHandler.h"
+#include "ParserCallback.h"
+#include "PChart.h"
+#include "SChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+class SDerivation;
+struct SHyperedge;
+
+namespace S2T
+{
+
+template<typename Parser>
+class Manager
+{
+ public:
+ Manager(const InputType &);
+
+ void Decode();
+
+ // Get the SHyperedge for the 1-best derivation.
+ const SHyperedge *GetBestSHyperedge() const;
+
+ void ExtractKBest(
+ std::size_t k,
+ std::vector<boost::shared_ptr<KBestExtractor::Derivation> > &kBestList,
+ bool onlyDistinct=false) const;
+
+ const std::set<Word> &GetUnknownWords() const { return m_oovs; }
+
+ private:
+ void FindOovs(const PChart &, std::set<Word> &, std::size_t &);
+
+ void InitializeCharts();
+
+ void InitializeParsers(PChart &, std::size_t);
+
+ void RecombineAndSort(const std::vector<SHyperedge*> &, SVertexStack &);
+
+ void PrunePChart(const SChart::Cell &, PChart::Cell &);
+
+ const InputType &m_source;
+ PChart m_pchart;
+ SChart m_schart;
+ std::set<Word> m_oovs;
+ boost::shared_ptr<typename Parser::RuleTrie> m_oovRuleTrie;
+ std::vector<boost::shared_ptr<Parser> > m_parsers;
+};
+
+} // S2T
+} // Syntax
+} // Moses
+
+// Implementation
+#include "Manager-inl.h"
diff --git a/moses/Syntax/S2T/OovHandler-inl.h b/moses/Syntax/S2T/OovHandler-inl.h
new file mode 100644
index 000000000..e700f65c5
--- /dev/null
+++ b/moses/Syntax/S2T/OovHandler-inl.h
@@ -0,0 +1,107 @@
+#pragma once
+
+#include "moses/FF/UnknownWordPenaltyProducer.h"
+#include "moses/StaticData.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename RuleTrie>
+template<typename InputIterator>
+boost::shared_ptr<RuleTrie> OovHandler<RuleTrie>::SynthesizeRuleTrie(
+ InputIterator first, InputIterator last)
+{
+ const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
+
+ boost::shared_ptr<RuleTrie> trie(new RuleTrie(&m_ruleTableFF));
+
+ while (first != last) {
+ const Word &oov = *first++;
+ if (ShouldDrop(oov)) {
+ continue;
+ }
+ boost::scoped_ptr<Phrase> srcPhrase(SynthesizeSourcePhrase(oov));
+ for (UnknownLHSList::const_iterator p = lhsList.begin();
+ p != lhsList.end(); ++p) {
+ const std::string &targetLhsStr = p->first;
+ float prob = p->second;
+// TODO Check ownership and fix any leaks.
+ Word *tgtLHS = SynthesizeTargetLhs(targetLhsStr);
+ TargetPhrase *tp = SynthesizeTargetPhrase(oov, *srcPhrase, *tgtLHS, prob);
+ TargetPhraseCollection &tpc = GetOrCreateTargetPhraseCollection(
+ *trie, *srcPhrase, *tp, NULL); // TODO Check NULL is valid argument
+ tpc.Add(tp);
+ }
+ }
+
+ return trie;
+}
+
+template<typename RuleTrie>
+Phrase *OovHandler<RuleTrie>::SynthesizeSourcePhrase(const Word &sourceWord)
+{
+ Phrase *phrase = new Phrase(1);
+ phrase->AddWord() = sourceWord;
+ phrase->GetWord(0).SetIsOOV(true);
+ return phrase;
+}
+
+template<typename RuleTrie>
+Word *OovHandler<RuleTrie>::SynthesizeTargetLhs(const std::string &lhsStr)
+{
+ Word *targetLhs = new Word(true);
+ targetLhs->CreateFromString(Output,
+ StaticData::Instance().GetOutputFactorOrder(),
+ lhsStr, true);
+ UTIL_THROW_IF2(targetLhs->GetFactor(0) == NULL, "Null factor for target LHS");
+ return targetLhs;
+}
+
+template<typename RuleTrie>
+TargetPhrase *OovHandler<RuleTrie>::SynthesizeTargetPhrase(
+ const Word &oov, const Phrase &srcPhrase, const Word &targetLhs, float prob)
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ const UnknownWordPenaltyProducer &unknownWordPenaltyProducer =
+ UnknownWordPenaltyProducer::Instance();
+
+ TargetPhrase *targetPhrase = new TargetPhrase();
+ Word &targetWord = targetPhrase->AddWord();
+ targetWord.CreateUnknownWord(oov);
+
+ // scores
+ float score = FloorScore(TransformScore(prob));
+
+ targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, score);
+ targetPhrase->EvaluateInIsolation(srcPhrase);
+ targetPhrase->SetTargetLHS(&targetLhs);
+ targetPhrase->SetAlignmentInfo("0-0");
+ if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() ||
+ staticData.GetTreeStructure() != NULL) {
+ std::string value = "[ " + targetLhs[0]->GetString().as_string() + " " +
+ oov[0]->GetString().as_string() + " ]";
+ targetPhrase->SetProperty("Tree", value);
+ }
+
+ return targetPhrase;
+}
+
+template<typename RuleTrie>
+bool OovHandler<RuleTrie>::ShouldDrop(const Word &oov)
+{
+ if (!StaticData::Instance().GetDropUnknown()) {
+ return false;
+ }
+ const Factor *f = oov[0]; // TODO hack. shouldn't know which factor is surface
+ const StringPiece s = f->GetString();
+ // Mirror the chart decoder's drop-unknown convention: OOVs containing
+ // digits (numbers, dates) are kept; all others are dropped.
+ return s.find_first_of("0123456789") == std::string::npos;
+}
+
+} // S2T
+} // Syntax
+} // Moses
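
To make SynthesizeRuleTrie concrete: with a hypothetical unknown-LHS configuration of {(X, 0.6), (NN, 0.4)} and OOV set {foo}, the synthesized trie would hold two single-word rules, each rewriting a configured LHS to a copy of the unknown word and scored through the unknown-word penalty producer:

    [X]  -> foo   score = FloorScore(TransformScore(0.6))
    [NN] -> foo   score = FloorScore(TransformScore(0.4))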
diff --git a/moses/Syntax/S2T/OovHandler.h b/moses/Syntax/S2T/OovHandler.h
new file mode 100644
index 000000000..b74e697c5
--- /dev/null
+++ b/moses/Syntax/S2T/OovHandler.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <string>
+
+#include <boost/shared_ptr.hpp>
+
+#include "moses/Syntax/RuleTableFF.h"
+#include "moses/TargetPhrase.h"
+#include "moses/Word.h"
+
+#include "RuleTrieCreator.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename RuleTrie>
+class OovHandler : public RuleTrieCreator
+{
+ public:
+ OovHandler(const RuleTableFF &ff) : m_ruleTableFF(ff) {}
+
+ // Synthesize a RuleTrie given a sequence of OOV words. The sequence is
+ // specified by a pair of iterators (indicating the beginning and end). It
+ // is assumed not to contain duplicates.
+ template<typename InputIterator>
+ boost::shared_ptr<RuleTrie> SynthesizeRuleTrie(InputIterator, InputIterator);
+
+ private:
+ const RuleTableFF &m_ruleTableFF;
+
+ bool ShouldDrop(const Word &);
+
+ Phrase *SynthesizeSourcePhrase(const Word &);
+
+ Word *SynthesizeTargetLhs(const std::string &);
+
+ TargetPhrase *SynthesizeTargetPhrase(const Word &, const Phrase &,
+ const Word &, float);
+};
+
+} // S2T
+} // Syntax
+} // Moses
+
+#include "OovHandler-inl.h"
diff --git a/moses/Syntax/S2T/PChart.cpp b/moses/Syntax/S2T/PChart.cpp
new file mode 100644
index 000000000..de62e7a84
--- /dev/null
+++ b/moses/Syntax/S2T/PChart.cpp
@@ -0,0 +1,34 @@
+#include "PChart.h"
+
+#include "moses/FactorCollection.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+PChart::PChart(std::size_t width, bool maintainCompressedChart)
+{
+ m_cells.resize(width);
+ for (std::size_t i = 0; i < width; ++i) {
+ m_cells[i].resize(width);
+ }
+ // Null-initialize so that the destructor's delete is safe when no
+ // compressed chart is requested.
+ m_compressedChart = NULL;
+ if (maintainCompressedChart) {
+ m_compressedChart = new CompressedChart(width);
+ for (CompressedChart::iterator p = m_compressedChart->begin();
+ p != m_compressedChart->end(); ++p) {
+ p->resize(FactorCollection::Instance().GetNumNonTerminals());
+ }
+ }
+}
+
+PChart::~PChart()
+{
+ delete m_compressedChart;
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/PChart.h b/moses/Syntax/S2T/PChart.h
new file mode 100644
index 000000000..8f719eebb
--- /dev/null
+++ b/moses/Syntax/S2T/PChart.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "moses/Syntax/NonTerminalMap.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class PChart
+{
+ public:
+ struct Cell
+ {
+ typedef boost::unordered_map<Word, PVertex, SymbolHasher,
+ SymbolEqualityPred> TMap;
+ typedef NonTerminalMap<PVertex> NMap;
+ // Collection of terminal vertices (keyed by terminal symbol).
+ TMap terminalVertices;
+ // Collection of non-terminal vertices (keyed by non-terminal symbol).
+ NMap nonTerminalVertices;
+ };
+
+ struct CompressedItem {
+ std::size_t end;
+ const PVertex *vertex;
+ };
+
+ typedef std::vector<std::vector<CompressedItem> > CompressedMatrix;
+
+ PChart(std::size_t width, bool maintainCompressedChart);
+
+ ~PChart();
+
+ std::size_t GetWidth() const { return m_cells.size(); }
+
+ const Cell &GetCell(std::size_t start, std::size_t end) const {
+ return m_cells[start][end];
+ }
+
+ // Insert the given PVertex and return a reference to the inserted object.
+ PVertex &AddVertex(const PVertex &v) {
+ const std::size_t start = v.span.GetStartPos();
+ const std::size_t end = v.span.GetEndPos();
+ Cell &cell = m_cells[start][end];
+ // If v is a terminal vertex add it to the cell's terminalVertices map.
+ if (!v.symbol.IsNonTerminal()) {
+ Cell::TMap::value_type x(v.symbol, v);
+ std::pair<Cell::TMap::iterator, bool> ret =
+ cell.terminalVertices.insert(x);
+ return ret.first->second;
+ }
+ // If v is a non-terminal vertex add it to the cell's nonTerminalVertices
+ // map and update the compressed chart (if enabled).
+ std::pair<Cell::NMap::Iterator, bool> result =
+ cell.nonTerminalVertices.Insert(v.symbol, v);
+ if (result.second && m_compressedChart) {
+ CompressedItem item;
+ item.end = end;
+ item.vertex = &(result.first->second);
+ (*m_compressedChart)[start][v.symbol[0]->GetId()].push_back(item);
+ }
+ return result.first->second;
+ }
+
+ const CompressedMatrix &GetCompressedMatrix(std::size_t start) const {
+ return (*m_compressedChart)[start];
+ }
+
+ private:
+ typedef std::vector<CompressedMatrix> CompressedChart;
+
+ std::vector<std::vector<Cell> > m_cells;
+ CompressedChart *m_compressedChart;
+};
+
+} // S2T
+} // Syntax
+} // Moses
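
The compressed chart gives a parser an O(1) route from (start position, non-terminal id) to every vertex with that label starting there. The lookup pattern, as used by GetNonTerminalExtensions in the recursive CYK+ parser below, is:

    const PChart::CompressedMatrix &matrix = chart.GetCompressedMatrix(start);
    const std::vector<PChart::CompressedItem> &items = matrix[ntWord[0]->GetId()];
    for (std::size_t i = 0; i < items.size(); ++i) {
      // items[i].vertex has label ntWord and covers [start, items[i].end].
    }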
diff --git a/moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h b/moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h
new file mode 100644
index 000000000..dd0be3ae9
--- /dev/null
+++ b/moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "moses/Syntax/PHyperedge.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SHyperedgeBundle.h"
+
+#include "SChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Given a PHyperedge object and SChart produces a SHyperedgeBundle object.
+inline void PHyperedgeToSHyperedgeBundle(const PHyperedge &hyperedge,
+ const SChart &schart,
+ SHyperedgeBundle &bundle) {
+ bundle.translations = hyperedge.translations;
+ bundle.stacks.clear();
+ for (std::vector<PVertex*>::const_iterator p = hyperedge.tail.begin();
+ p != hyperedge.tail.end(); ++p) {
+ const PVertex *v = *p;
+ std::size_t spanStart = v->span.GetStartPos();
+ std::size_t spanEnd = v->span.GetEndPos();
+ const Word &symbol = v->symbol;
+ const SChart::Cell &cell = schart.GetCell(spanStart, spanEnd);
+ const SVertexStack *stack = 0;
+ if (symbol.IsNonTerminal()) {
+ stack = cell.nonTerminalStacks.Find(symbol);
+ } else {
+ const SChart::Cell::TMap::const_iterator q =
+ cell.terminalStacks.find(symbol);
+ assert(q != cell.terminalStacks.end());
+ stack = &(q->second);
+ }
+ bundle.stacks.push_back(stack);
+ }
+}
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/ParserCallback.h b/moses/Syntax/S2T/ParserCallback.h
new file mode 100644
index 000000000..b18a85eae
--- /dev/null
+++ b/moses/Syntax/S2T/ParserCallback.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <limits>
+
+#include "moses/Syntax/BoundedPriorityContainer.h"
+#include "moses/Syntax/PHyperedge.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SHyperedgeBundle.h"
+#include "moses/Syntax/SHyperedgeBundleScorer.h"
+
+#include "PHyperedgeToSHyperedgeBundle.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class StandardParserCallback {
+ private:
+ typedef BoundedPriorityContainer<SHyperedgeBundle> Container;
+
+ public:
+ StandardParserCallback(const SChart &schart, std::size_t ruleLimit)
+ : m_schart(schart)
+ , m_container(ruleLimit) {}
+
+ void operator()(const PHyperedge &hyperedge) {
+ PHyperedgeToSHyperedgeBundle(hyperedge, m_schart, m_tmpBundle);
+ float score = SHyperedgeBundleScorer::Score(m_tmpBundle);
+ m_container.SwapIn(m_tmpBundle, score);
+ }
+
+ void InitForRange(const WordsRange &range) { m_container.LazyClear(); }
+
+ const Container &GetContainer() { return m_container; }
+
+ private:
+ const SChart &m_schart;
+ SHyperedgeBundle m_tmpBundle;
+ BoundedPriorityContainer<SHyperedgeBundle> m_container;
+};
+
+class EagerParserCallback {
+ private:
+ typedef BoundedPriorityContainer<SHyperedgeBundle> Container;
+
+ public:
+ EagerParserCallback(const SChart &schart, std::size_t ruleLimit)
+ : m_schart(schart)
+ , m_containers(schart.GetWidth(), Container(ruleLimit))
+ , m_prevStart(std::numeric_limits<std::size_t>::max()) {}
+
+ void operator()(const PHyperedge &hyperedge, std::size_t end) {
+ PHyperedgeToSHyperedgeBundle(hyperedge, m_schart, m_tmpBundle);
+ float score = SHyperedgeBundleScorer::Score(m_tmpBundle);
+ m_containers[end].SwapIn(m_tmpBundle, score);
+ }
+
+ void InitForRange(const WordsRange &range) {
+ const std::size_t start = range.GetStartPos();
+ m_end = range.GetEndPos();
+ if (start != m_prevStart) {
+ for (std::vector<Container>::iterator p = m_containers.begin();
+ p != m_containers.end(); ++p) {
+ p->LazyClear();
+ }
+ m_prevStart = start;
+ }
+ }
+
+ const Container &GetContainer() { return m_containers[m_end]; }
+
+ private:
+ const SChart &m_schart;
+ SHyperedgeBundle m_tmpBundle;
+ std::vector<Container> m_containers;
+ std::size_t m_end;
+ std::size_t m_prevStart;
+};
+
+} // S2T
+} // Syntax
+} // Moses
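
EagerParserCallback keeps one container per end position and clears them only when the start position changes. That matches the span order produced by Manager::Decode (fixed start, growing width, starts visited right to left), so for a sentence of length four the call pattern is:

    callback.InitForRange(WordsRange(3, 3)); // new start: containers cleared
    callback.InitForRange(WordsRange(2, 2)); // start changed: cleared again
    callback.InitForRange(WordsRange(2, 3)); // same start: buffered bundles kept
    callback.InitForRange(WordsRange(1, 1)); // start changed: cleared again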
diff --git a/moses/Syntax/S2T/Parsers/Parser.h b/moses/Syntax/S2T/Parsers/Parser.h
new file mode 100644
index 000000000..b13a8d502
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Parser.h
@@ -0,0 +1,30 @@
+#pragma once
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class PChart;
+
+// Base class for parsers.
+template<typename Callback>
+class Parser
+{
+ public:
+ typedef Callback CallbackType;
+
+ Parser(PChart &chart) : m_chart(chart) {}
+
+ virtual ~Parser() {}
+
+ virtual void EnumerateHyperedges(const WordsRange &, Callback &) = 0;
+ protected:
+ PChart &m_chart;
+};
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h b/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h
new file mode 100644
index 000000000..b275a93ee
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h
@@ -0,0 +1,164 @@
+#pragma once
+
+#include "moses/Syntax/S2T/PChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename Callback>
+RecursiveCYKPlusParser<Callback>::RecursiveCYKPlusParser(
+ PChart &chart,
+ const RuleTrie &trie,
+ std::size_t maxChartSpan)
+ : Parser<Callback>(chart)
+ , m_ruleTable(trie)
+ , m_maxChartSpan(maxChartSpan)
+ , m_callback(NULL)
+{
+ m_hyperedge.head = 0;
+}
+
+template<typename Callback>
+void RecursiveCYKPlusParser<Callback>::EnumerateHyperedges(
+ const WordsRange &range,
+ Callback &callback)
+{
+ const std::size_t start = range.GetStartPos();
+ const std::size_t end = range.GetEndPos();
+ m_callback = &callback;
+ const RuleTrie::Node &rootNode = m_ruleTable.GetRootNode();
+ m_maxEnd = std::min(Base::m_chart.GetWidth()-1, start+m_maxChartSpan-1);
+ m_hyperedge.tail.clear();
+
+ // Find all hyperedges where the first incoming vertex is a terminal covering
+ // [start,end].
+ GetTerminalExtension(rootNode, start, end);
+
+ // Find all hyperedges where the first incoming vertex is a non-terminal
+ // covering [start,end-1].
+ if (end > start) {
+ GetNonTerminalExtensions(rootNode, start, end-1, end-1);
+ }
+}
+
+// Search for all extensions of a partial rule (pointed at by node) that begin
+// with a non-terminal covering a span [start,end], where minEnd <= end <= maxEnd.
+template<typename Callback>
+void RecursiveCYKPlusParser<Callback>::GetNonTerminalExtensions(
+ const RuleTrie::Node &node,
+ std::size_t start,
+ std::size_t minEnd,
+ std::size_t maxEnd) {
+ // Non-terminal labels in node's outgoing edge set.
+ const RuleTrie::Node::SymbolMap &nonTermMap = node.GetNonTerminalMap();
+
+ // Compressed matrix from PChart.
+ const PChart::CompressedMatrix &matrix =
+ Base::m_chart.GetCompressedMatrix(start);
+
+ // Loop over possible expansions of the rule.
+ RuleTrie::Node::SymbolMap::const_iterator p;
+ RuleTrie::Node::SymbolMap::const_iterator p_end = nonTermMap.end();
+ for (p = nonTermMap.begin(); p != p_end; ++p) {
+ const Word &nonTerm = p->first;
+ const std::vector<PChart::CompressedItem> &items =
+ matrix[nonTerm[0]->GetId()];
+ for (std::vector<PChart::CompressedItem>::const_iterator q = items.begin();
+ q != items.end(); ++q) {
+ if (q->end >= minEnd && q->end <= maxEnd) {
+ const RuleTrie::Node &child = p->second;
+ AddAndExtend(child, q->end, *(q->vertex));
+ }
+ }
+ }
+}
+
+// Search for all extensions of a partial rule (pointed at by node) that begin
+// with a terminal over span [start,end].
+template<typename Callback>
+void RecursiveCYKPlusParser<Callback>::GetTerminalExtension(
+ const RuleTrie::Node &node,
+ std::size_t start,
+ std::size_t end) {
+
+ const PChart::Cell::TMap &vertexMap =
+ Base::m_chart.GetCell(start, end).terminalVertices;
+ if (vertexMap.empty()) {
+ return;
+ }
+
+ const RuleTrie::Node::SymbolMap &terminals = node.GetTerminalMap();
+
+ for (PChart::Cell::TMap::const_iterator p = vertexMap.begin();
+ p != vertexMap.end(); ++p) {
+ const Word &terminal = p->first;
+ const PVertex &vertex = p->second;
+
+    // If the node has a small number of terminal edges, test word equality for each.
+ if (terminals.size() < 5) {
+ for (RuleTrie::Node::SymbolMap::const_iterator iter = terminals.begin();
+ iter != terminals.end(); ++iter) {
+ const Word &word = iter->first;
+ if (word == terminal) {
+        const RuleTrie::Node *child = &iter->second;
+ AddAndExtend(*child, end, vertex);
+ break;
+ }
+ }
+    } else { // Otherwise, do a hash lookup.
+ const RuleTrie::Node *child = node.GetChild(terminal);
+ if (child != NULL) {
+ AddAndExtend(*child, end, vertex);
+ }
+ }
+ }
+}
+
+// If a (partial) rule matches, pass it to the callback (if non-unary and
+// non-empty), and try to find expansions that have this partial rule as prefix.
+template<typename Callback>
+void RecursiveCYKPlusParser<Callback>::AddAndExtend(
+ const RuleTrie::Node &node,
+ std::size_t end,
+ const PVertex &vertex) {
+ // FIXME Sort out const-ness.
+ m_hyperedge.tail.push_back(const_cast<PVertex *>(&vertex));
+
+ // Add target phrase collection (except if rule is empty or unary).
+ const TargetPhraseCollection &tpc = node.GetTargetPhraseCollection();
+ if (!tpc.IsEmpty() && !IsNonLexicalUnary(m_hyperedge)) {
+ m_hyperedge.translations = &tpc;
+ (*m_callback)(m_hyperedge, end);
+ }
+
+ // Get all further extensions of rule (until reaching end of sentence or
+ // max-chart-span).
+ if (end < m_maxEnd) {
+ if (!node.GetTerminalMap().empty()) {
+ for (std::size_t newEndPos = end+1; newEndPos <= m_maxEnd; newEndPos++) {
+ GetTerminalExtension(node, end+1, newEndPos);
+ }
+ }
+ if (!node.GetNonTerminalMap().empty()) {
+ GetNonTerminalExtensions(node, end+1, end+1, m_maxEnd);
+ }
+ }
+
+ m_hyperedge.tail.pop_back();
+}
+
+template<typename Callback>
+bool RecursiveCYKPlusParser<Callback>::IsNonLexicalUnary(
+ const PHyperedge &hyperedge) const
+{
+ return hyperedge.tail.size() == 1 &&
+ hyperedge.tail[0]->symbol.IsNonTerminal();
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
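
Stripped of the rule trie and the chart, the control flow of AddAndExtend is a backtracking enumeration of contiguous segmentations of a span: push a vertex, recurse over every admissible end position for the next symbol, pop. A hedged standalone sketch of just that skeleton:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Enumerate every way to split [next,end] into contiguous segments,
    // echoing the push/recurse/pop discipline of AddAndExtend.
    void Extend(int next, int end, std::vector<int> &cuts) {
      if (next > end) {  // a complete tail: report it, as the callback would
        int prev = 0;
        for (std::size_t i = 0; i < cuts.size(); ++i) {
          std::cout << "[" << prev << "," << cuts[i] << "] ";
          prev = cuts[i] + 1;
        }
        std::cout << "\n";
        return;
      }
      for (int segEnd = next; segEnd <= end; ++segEnd) {
        cuts.push_back(segEnd);        // extend the tail by one more segment
        Extend(segEnd + 1, end, cuts);
        cuts.pop_back();               // backtrack, as with m_hyperedge.tail
      }
    }

    int main() {
      std::vector<int> cuts;
      Extend(0, 2, cuts);  // prints the four segmentations of the span [0,2]
      return 0;
    }
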
diff --git a/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h b/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h
new file mode 100644
index 000000000..264d43eea
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "moses/Syntax/PHyperedge.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/S2T/Parsers/Parser.h"
+#include "moses/Syntax/S2T/RuleTrieCYKPlus.h"
+#include "moses/WordsRange.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Parser that implements the recursive variant of CYK+ from this paper:
+//
+// Rico Sennrich
+// "A CYK+ Variant for SCFG Decoding Without a Dot Chart"
+// In proceedings of SSST-8 2014
+//
+template<typename Callback>
+class RecursiveCYKPlusParser : public Parser<Callback>
+{
+ public:
+ typedef Parser<Callback> Base;
+ typedef RuleTrieCYKPlus RuleTrie;
+
+ // TODO Make this configurable?
+ static bool RequiresCompressedChart() { return true; }
+
+ RecursiveCYKPlusParser(PChart &, const RuleTrie &, std::size_t);
+
+ ~RecursiveCYKPlusParser() {}
+
+ void EnumerateHyperedges(const WordsRange &, Callback &);
+
+ private:
+
+ void GetTerminalExtension(const RuleTrie::Node &, std::size_t, std::size_t);
+
+ void GetNonTerminalExtensions(const RuleTrie::Node &, std::size_t,
+ std::size_t, std::size_t);
+
+ void AddAndExtend(const RuleTrie::Node &, std::size_t, const PVertex &);
+
+ bool IsNonLexicalUnary(const PHyperedge &) const;
+
+ const RuleTrie &m_ruleTable;
+ const std::size_t m_maxChartSpan;
+ std::size_t m_maxEnd;
+ PHyperedge m_hyperedge;
+ Callback *m_callback;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
+
+// Implementation
+#include "RecursiveCYKPlusParser-inl.h"
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h b/moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h
new file mode 100644
index 000000000..d55f7e842
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h
@@ -0,0 +1,185 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "moses/ChartParser.h"
+#include "moses/ChartTranslationOptionList.h"
+#include "moses/InputType.h"
+#include "moses/NonTerminal.h"
+#include "moses/StaticData.h"
+#include "moses/Syntax/S2T/Parsers/Parser.h"
+#include "moses/Syntax/S2T/PChart.h"
+
+#include "TailLatticeSearcher.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename Callback>
+Scope3Parser<Callback>::Scope3Parser(PChart &chart, const RuleTrie &trie,
+ std::size_t maxChartSpan)
+ : Parser<Callback>(chart)
+ , m_ruleTable(trie)
+ , m_maxChartSpan(maxChartSpan)
+ , m_latticeBuilder(chart)
+{
+ Init();
+}
+
+template<typename Callback>
+Scope3Parser<Callback>::~Scope3Parser()
+{
+ delete m_patRoot;
+}
+
+template<typename Callback>
+void Scope3Parser<Callback>::EnumerateHyperedges(const WordsRange &range,
+ Callback &callback)
+{
+ const std::size_t start = range.GetStartPos();
+ const std::size_t end = range.GetEndPos();
+
+ const std::vector<const PatternApplicationTrie *> &patNodes =
+ m_patSpans[start][end-start+1];
+
+ for (std::vector<const PatternApplicationTrie *>::const_iterator
+ p = patNodes.begin(); p != patNodes.end(); ++p) {
+ const PatternApplicationTrie *patNode = *p;
+
+ // Read off the sequence of PAT nodes ending at patNode.
+ patNode->ReadOffPatternApplicationKey(m_patKey);
+
+ // Calculate the start and end ranges for each symbol in the PAT key.
+ m_symbolRangeCalculator.Calc(m_patKey, start, end, m_symbolRanges);
+
+ // Build a lattice that encodes the set of PHyperedge tails that can be
+ // generated from this pattern + span.
+ m_latticeBuilder.Build(m_patKey, m_symbolRanges, m_lattice,
+ m_quickCheckTable);
+
+ // Ask the grammar for the mapping from label sequences to target phrase
+ // collections for this pattern.
+ const RuleTrie::Node::LabelMap &labelMap =
+ patNode->m_node->GetLabelMap();
+
+ // For each label sequence, search the lattice for the set of PHyperedge
+ // tails.
+ TailLatticeSearcher<Callback> searcher(m_lattice, m_patKey, m_symbolRanges);
+ RuleTrie::Node::LabelMap::const_iterator q = labelMap.begin();
+ for (; q != labelMap.end(); ++q) {
+ const std::vector<int> &labelSeq = q->first;
+ const TargetPhraseCollection &tpc = q->second;
+ // For many label sequences there won't be any corresponding paths through
+ // the lattice. As an optimisation, we use m_quickCheckTable to test
+ // for this and we don't begin a search if there are no paths to find.
+ bool failCheck = false;
+ std::size_t nonTermIndex = 0;
+ for (std::size_t i = 0; i < m_patKey.size(); ++i) {
+ if (m_patKey[i]->IsTerminalNode()) {
+ continue;
+ }
+ if (!m_quickCheckTable[nonTermIndex][labelSeq[nonTermIndex]]) {
+ failCheck = true;
+ break;
+ }
+ ++nonTermIndex;
+ }
+ if (failCheck) {
+ continue;
+ }
+ searcher.Search(labelSeq, tpc, callback);
+ }
+ }
+}
+
+template<typename Callback>
+void Scope3Parser<Callback>::Init()
+{
+ // Build a map from Words to PVertex sets.
+ SentenceMap sentMap;
+ FillSentenceMap(sentMap);
+
+ // Build the pattern application trie (PAT) for this input sentence.
+ const RuleTrie::Node &root = m_ruleTable.GetRootNode();
+ m_patRoot = new PatternApplicationTrie(-1, -1, root, 0, 0);
+ m_patRoot->Extend(root, -1, sentMap, false);
+
+ // Generate per-span lists of PAT node pointers.
+ InitRuleApplicationVector();
+ RecordPatternApplicationSpans(*m_patRoot);
+}
+
+/* TODO Rename */
+template<typename Callback>
+void Scope3Parser<Callback>::InitRuleApplicationVector()
+{
+ std::size_t length = Base::m_chart.GetWidth();
+ m_patSpans.resize(length);
+ for (std::size_t start = 0; start < length; ++start) {
+ std::size_t maxSpan = length-start;
+ m_patSpans[start].resize(maxSpan+1);
+ }
+}
+
+template<typename Callback>
+void Scope3Parser<Callback>::FillSentenceMap(SentenceMap &sentMap)
+{
+ typedef PChart::Cell Cell;
+
+ const std::size_t width = Base::m_chart.GetWidth();
+ for (std::size_t i = 0; i < width; ++i) {
+ for (std::size_t j = i; j < width; ++j) {
+ const Cell::TMap &map = Base::m_chart.GetCell(i, j).terminalVertices;
+ for (Cell::TMap::const_iterator p = map.begin(); p != map.end(); ++p) {
+ const Word &terminal = p->first;
+ const PVertex &v = p->second;
+ sentMap[terminal].push_back(&v);
+ }
+ }
+ }
+}
+
+template<typename Callback>
+void Scope3Parser<Callback>::RecordPatternApplicationSpans(
+ const PatternApplicationTrie &patNode)
+{
+ if (patNode.m_node->HasRules()) {
+ int s1 = -1;
+ int s2 = -1;
+ int e1 = -1;
+ int e2 = -1;
+ patNode.DetermineStartRange(Base::m_chart.GetWidth(), s1, s2);
+ patNode.DetermineEndRange(Base::m_chart.GetWidth(), e1, e2);
+
+ int minSpan = patNode.Depth();
+
+ // Add a PAT node pointer for each valid span in the range.
+ for (int i = s1; i <= s2; ++i) {
+ for (int j = std::max(e1, i+minSpan-1); j <= e2; ++j) {
+ std::size_t span = j-i+1;
+ assert(span >= 1);
+ if (span < minSpan) {
+ continue;
+ }
+ if (m_maxChartSpan && span > m_maxChartSpan) {
+ break;
+ }
+ m_patSpans[i][span].push_back(&patNode);
+ }
+ }
+ }
+
+ for (std::vector<PatternApplicationTrie*>::const_iterator p =
+ patNode.m_children.begin(); p != patNode.m_children.end(); ++p) {
+ RecordPatternApplicationSpans(**p);
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
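
A standalone illustration of the quick-check filter used above, with invented table contents: a label sequence is discarded as soon as one of its labels never occurs in the corresponding gap's part of the lattice, so the full lattice search is only launched when it can possibly succeed:

    #include <iostream>
    #include <vector>

    int main() {
      // quickCheck[g][l]: does label l ever occur in gap g's lattice column?
      std::vector<std::vector<bool> > quickCheck(2, std::vector<bool>(3, false));
      quickCheck[0][1] = true;                     // gap 0 only carries label 1
      quickCheck[1][0] = quickCheck[1][2] = true;  // gap 1 carries labels 0, 2

      const int labelSeqs[4][2] = { {1, 0}, {1, 2}, {0, 0}, {1, 1} };
      for (int s = 0; s < 4; ++s) {
        bool ok = true;
        for (int g = 0; g < 2 && ok; ++g) {
          ok = quickCheck[g][labelSeqs[s][g]];
        }
        std::cout << "sequence " << s << (ok ? ": search\n" : ": skip\n");
      }
      return 0;
    }
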
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h b/moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h
new file mode 100644
index 000000000..d3104d9b1
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "moses/Syntax/S2T/Parsers/Parser.h"
+#include "moses/Syntax/S2T/RuleTrieScope3.h"
+#include "moses/WordsRange.h"
+
+#include "PatternApplicationTrie.h"
+#include "SymbolRangeCalculator.h"
+#include "TailLattice.h"
+#include "TailLatticeBuilder.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Parser that implements the algorithm described in this paper:
+//
+// Philip Williams and Philipp Koehn
+// "GHKM Rule Extraction and Scope-3 Parsing in Moses"
+// In proceedings of WMT 2012
+//
+template<typename Callback>
+class Scope3Parser : public Parser<Callback>
+{
+public:
+ typedef Parser<Callback> Base;
+ typedef RuleTrieScope3 RuleTrie;
+
+ // TODO Make this configurable?
+ static bool RequiresCompressedChart() { return false; }
+
+ Scope3Parser(PChart &, const RuleTrie &, std::size_t);
+
+ ~Scope3Parser();
+
+ void EnumerateHyperedges(const WordsRange &, Callback &);
+
+private:
+ void Init();
+ void InitRuleApplicationVector();
+ void FillSentenceMap(SentenceMap &);
+ void RecordPatternApplicationSpans(const PatternApplicationTrie &);
+
+ PatternApplicationTrie *m_patRoot;
+ std::vector<std::vector<bool> > m_quickCheckTable;
+ const RuleTrie &m_ruleTable;
+ const std::size_t m_maxChartSpan;
+ TailLattice m_lattice;
+ TailLatticeBuilder m_latticeBuilder;
+ SymbolRangeCalculator m_symbolRangeCalculator;
+ std::vector<SymbolRange> m_symbolRanges;
+ PatternApplicationKey m_patKey;
+
+  /* m_patSpans[i][j] records the set of all PAT nodes for the span of
+     width j starting at position i, i.e. the span [i, i+j-1] */
+ std::vector<std::vector<
+ std::vector<const PatternApplicationTrie *> > > m_patSpans;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
+
+// Implementation
+#include "Parser-inl.h"
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp b/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp
new file mode 100644
index 000000000..218cd4017
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp
@@ -0,0 +1,190 @@
+#include "PatternApplicationTrie.h"
+
+#include "moses/Syntax/PVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+int PatternApplicationTrie::Depth() const {
+ if (m_parent) {
+ return m_parent->Depth() + 1;
+ }
+ return 0;
+}
+
+const PatternApplicationTrie *
+PatternApplicationTrie::GetHighestTerminalNode() const
+{
+ // Check if result has been cached.
+ if (m_highestTerminalNode) {
+ return m_highestTerminalNode;
+ }
+ // It doesn't really make sense to call this on the root node. Just return 0.
+ if (!m_parent) {
+ return 0;
+ }
+ // Is this the highest non-root node?
+ if (!m_parent->m_parent) {
+ if (IsTerminalNode()) {
+ m_highestTerminalNode = this;
+ return this;
+ } else {
+ return 0;
+ }
+ }
+ // This is not the highest non-root node, so ask parent node.
+ if (const PatternApplicationTrie *p = m_parent->GetHighestTerminalNode()) {
+ m_highestTerminalNode = p;
+ return p;
+ }
+ // There are no terminal nodes higher than this node.
+ if (IsTerminalNode()) {
+ m_highestTerminalNode = this;
+ }
+ return m_highestTerminalNode;
+}
+
+const PatternApplicationTrie *
+PatternApplicationTrie::GetLowestTerminalNode() const
+{
+ // Check if result has been cached.
+ if (m_lowestTerminalNode) {
+ return m_lowestTerminalNode;
+ }
+ // It doesn't really make sense to call this on the root node. Just return 0.
+ if (!m_parent) {
+ return 0;
+ }
+ // Is this a terminal node?
+ if (IsTerminalNode()) {
+ m_lowestTerminalNode = this;
+ return this;
+ }
+ // Is this the highest non-root node?
+ if (!m_parent->m_parent) {
+ return 0;
+ }
+ // Ask parent node.
+ return m_parent->GetLowestTerminalNode();
+}
+
+// A node corresponds to a rule pattern that has been partially applied to a
+// sentence (the terminals have fixed positions, but the spans of gap symbols
+// may be unknown). This function determines the range of possible start
+// values for the partially-applied pattern.
+void PatternApplicationTrie::DetermineStartRange(int sentenceLength,
+ int &minStart,
+ int &maxStart) const
+{
+ // Find the leftmost terminal symbol, if any.
+ const PatternApplicationTrie *n = GetHighestTerminalNode();
+ if (!n) {
+ // The pattern contains only gap symbols.
+ minStart = 0;
+ maxStart = sentenceLength-Depth();
+ return;
+ }
+ assert(n->m_parent);
+ if (!n->m_parent->m_parent) {
+ // The pattern begins with a terminal symbol so the start position is
+ // fixed.
+ minStart = n->m_start;
+ maxStart = n->m_start;
+ } else {
+ // The pattern begins with a gap symbol but it contains at least one
+ // terminal symbol. The maximum start position is the start position of
+ // the leftmost terminal minus one position for each leading gap symbol.
+ minStart = 0;
+ maxStart = n->m_start - (n->Depth()-1);
+ }
+}
+
+// A node corresponds to a rule pattern that has been partially applied to a
+// sentence (the terminals have fixed positions, but the spans of gap symbols
+// may be unknown). This function determines the range of possible end values
+// for the partially-applied pattern.
+void PatternApplicationTrie::DetermineEndRange(int sentenceLength,
+ int &minEnd,
+ int &maxEnd) const
+{
+ // Find the rightmost terminal symbol, if any.
+ const PatternApplicationTrie *n = GetLowestTerminalNode();
+ if (!n) {
+ // The pattern contains only gap symbols.
+ minEnd = Depth()-1;
+ maxEnd = sentenceLength-1;
+ return;
+ }
+ if (n == this) {
+ // The pattern ends with a terminal symbol so the end position is fixed.
+ minEnd = m_end;
+ maxEnd = m_end;
+ } else {
+ // The pattern ends with a gap symbol but it contains at least one terminal
+ // symbol. The minimum end position is the end position of the rightmost
+    // terminal plus one position for each trailing gap symbol.
+ minEnd = n->m_end + (Depth()-n->Depth());
+ maxEnd = sentenceLength-1;
+ }
+}
+
+void PatternApplicationTrie::Extend(const RuleTrieScope3::Node &node,
+ int minPos, const SentenceMap &sentMap,
+ bool followsGap)
+{
+ const RuleTrieScope3::Node::TerminalMap &termMap = node.GetTerminalMap();
+ for (RuleTrieScope3::Node::TerminalMap::const_iterator p = termMap.begin();
+ p != termMap.end(); ++p) {
+ const Word &word = p->first;
+ const RuleTrieScope3::Node &child = p->second;
+ SentenceMap::const_iterator q = sentMap.find(word);
+ if (q == sentMap.end()) {
+ continue;
+ }
+ for (std::vector<const PVertex *>::const_iterator r = q->second.begin();
+ r != q->second.end(); ++r) {
+ const PVertex *v = *r;
+ std::size_t start = v->span.GetStartPos();
+ std::size_t end = v->span.GetEndPos();
+ if (start == (std::size_t)minPos ||
+ (followsGap && start > (std::size_t)minPos) ||
+ minPos == -1) {
+ PatternApplicationTrie *subTrie =
+ new PatternApplicationTrie(start, end, child, v, this);
+ subTrie->Extend(child, end+1, sentMap, false);
+ m_children.push_back(subTrie);
+ }
+ }
+ }
+
+ const RuleTrieScope3::Node *child = node.GetNonTerminalChild();
+ if (!child) {
+ return;
+ }
+ int start = followsGap ? -1 : minPos;
+ PatternApplicationTrie *subTrie =
+ new PatternApplicationTrie(start, -1, *child, 0, this);
+ int newMinPos = (minPos == -1 ? 1 : minPos+1);
+ subTrie->Extend(*child, newMinPos, sentMap, true);
+ m_children.push_back(subTrie);
+}
+
+void PatternApplicationTrie::ReadOffPatternApplicationKey(
+ PatternApplicationKey &key) const {
+ const int depth = Depth();
+ key.resize(depth);
+ const PatternApplicationTrie *p = this;
+ std::size_t i = depth-1;
+ while (p->m_parent != 0) {
+ key[i--] = p;
+ p = p->m_parent;
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
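
The start/end range arithmetic is easiest to verify on a concrete case. Take a pattern gap-terminal-gap applied to a sentence of length 10, with the terminal pinned at position 4; the formulas in DetermineStartRange and DetermineEndRange then give a start in [0,3] and an end in [5,9]. A standalone check of exactly that arithmetic (all constants invented for the example):

    #include <cassert>
    #include <iostream>

    int main() {
      const int n = 10;  // sentence length
      const int t = 4;   // fixed position of the single terminal
      // Pattern: gap (depth 1), terminal (depth 2), gap (depth 3).
      // DetermineStartRange, leftmost terminal at depth 2:
      int minStart = 0;
      int maxStart = t - (2 - 1);  // minus one position per leading gap symbol
      // DetermineEndRange, asked of the last node (depth 3), rightmost
      // terminal at depth 2:
      int minEnd = t + (3 - 2);    // plus one position per trailing gap symbol
      int maxEnd = n - 1;
      assert(minStart == 0 && maxStart == 3 && minEnd == 5 && maxEnd == 9);
      std::cout << "start in [" << minStart << "," << maxStart << "], "
                << "end in [" << minEnd << "," << maxEnd << "]\n";
      return 0;
    }
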
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h b/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h
new file mode 100644
index 000000000..0ad371367
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h
@@ -0,0 +1,65 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/Syntax/S2T/RuleTrieScope3.h"
+#include "moses/Util.h"
+
+#include "SentenceMap.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+struct PatternApplicationTrie;
+
+typedef std::vector<const PatternApplicationTrie*> PatternApplicationKey;
+
+struct PatternApplicationTrie {
+ public:
+ PatternApplicationTrie(int start, int end, const RuleTrieScope3::Node &node,
+ const PVertex *pvertex, PatternApplicationTrie *parent)
+ : m_start(start)
+ , m_end(end)
+ , m_node(&node)
+ , m_pvertex(pvertex)
+ , m_parent(parent)
+ , m_highestTerminalNode(0)
+ , m_lowestTerminalNode(0) {}
+
+ ~PatternApplicationTrie() {
+ RemoveAllInColl(m_children);
+ }
+
+ int Depth() const;
+
+ bool IsGapNode() const { return m_end == -1; }
+ bool IsTerminalNode() const { return m_end != -1; }
+
+ const PatternApplicationTrie *GetHighestTerminalNode() const;
+ const PatternApplicationTrie *GetLowestTerminalNode() const;
+
+ void DetermineStartRange(int, int &, int &) const;
+ void DetermineEndRange(int, int &, int &) const;
+
+ void Extend(const RuleTrieScope3::Node &node, int minPos,
+ const SentenceMap &sentMap, bool followsGap);
+
+ void ReadOffPatternApplicationKey(PatternApplicationKey &) const;
+
+ int m_start;
+ int m_end;
+ const RuleTrieScope3::Node *m_node;
+ const PVertex *m_pvertex;
+ PatternApplicationTrie *m_parent;
+ std::vector<PatternApplicationTrie*> m_children;
+ mutable const PatternApplicationTrie *m_highestTerminalNode;
+ mutable const PatternApplicationTrie *m_lowestTerminalNode;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h b/moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h
new file mode 100644
index 000000000..8e6aae9f1
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex;
+
+namespace S2T
+{
+
+// FIXME Check SymbolHasher does the right thing here
+typedef boost::unordered_map<Word, std::vector<const PVertex *>, SymbolHasher,
+ SymbolEqualityPred> SentenceMap;
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h
new file mode 100644
index 000000000..ccb0d6521
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h
@@ -0,0 +1,21 @@
+#pragma once
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Describes the range of possible start and end positions for a symbol
+// belonging to a node in a PatternApplicationTrie.
+struct SymbolRange {
+ int minStart;
+ int maxStart;
+ int minEnd;
+ int maxEnd;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp
new file mode 100644
index 000000000..0eb615db8
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp
@@ -0,0 +1,160 @@
+#include "SymbolRangeCalculator.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+void SymbolRangeCalculator::Calc(const PatternApplicationKey &key,
+ int spanStart, int spanEnd,
+ std::vector<SymbolRange> &ranges)
+{
+ FillInTerminalRanges(key, ranges);
+ FillInAuxSymbolInfo(ranges);
+ FillInGapRanges(key, spanStart, spanEnd, ranges);
+}
+
+// Fill in ranges for terminals and set ranges to -1 for non-terminals.
+void SymbolRangeCalculator::FillInTerminalRanges(
+ const PatternApplicationKey &key, std::vector<SymbolRange> &ranges)
+{
+ ranges.resize(key.size());
+ for (std::size_t i = 0; i < key.size(); ++i) {
+ const PatternApplicationTrie *patNode = key[i];
+ if (patNode->IsTerminalNode()) {
+ ranges[i].minStart = ranges[i].maxStart = patNode->m_start;
+ ranges[i].minEnd = ranges[i].maxEnd = patNode->m_end;
+ } else {
+ ranges[i].minStart = ranges[i].maxStart = -1;
+ ranges[i].minEnd = ranges[i].maxEnd = -1;
+ }
+ }
+}
+
+void SymbolRangeCalculator::FillInAuxSymbolInfo(
+ const std::vector<SymbolRange> &ranges)
+{
+ m_auxSymbolInfo.resize(ranges.size());
+
+ // Forward pass: set distanceToPrevTerminal.
+ int distanceToPrevTerminal = -1;
+ for (std::size_t i = 0; i < ranges.size(); ++i) {
+ const SymbolRange &range = ranges[i];
+ AuxSymbolInfo &auxInfo = m_auxSymbolInfo[i];
+ if (range.minStart != -1) {
+ // Symbol i is a terminal.
+ assert(range.maxStart == range.minStart);
+ distanceToPrevTerminal = 1;
+ // Distances are not used for terminals so set auxInfo value to -1.
+ auxInfo.distanceToPrevTerminal = -1;
+ } else if (distanceToPrevTerminal == -1) {
+ // Symbol i is a non-terminal and there are no preceding terminals.
+ auxInfo.distanceToPrevTerminal = -1;
+ } else {
+ // Symbol i is a non-terminal and there is a preceding terminal.
+ auxInfo.distanceToPrevTerminal = distanceToPrevTerminal++;
+ }
+ }
+
+ // Backward pass: set distanceToNextTerminal
+ int distanceToNextTerminal = -1;
+ for (std::size_t j = ranges.size(); j > 0; --j) {
+ std::size_t i = j-1;
+ const SymbolRange &range = ranges[i];
+ AuxSymbolInfo &auxInfo = m_auxSymbolInfo[i];
+ if (range.minStart != -1) {
+ // Symbol i is a terminal.
+ assert(range.maxStart == range.minStart);
+ distanceToNextTerminal = 1;
+ // Distances are not used for terminals so set auxInfo value to -1.
+ auxInfo.distanceToNextTerminal = -1;
+ } else if (distanceToNextTerminal == -1) {
+ // Symbol i is a non-terminal and there are no succeeding terminals.
+ auxInfo.distanceToNextTerminal = -1;
+ } else {
+ // Symbol i is a non-terminal and there is a succeeding terminal.
+ auxInfo.distanceToNextTerminal = distanceToNextTerminal++;
+ }
+ }
+}
+
+void SymbolRangeCalculator::FillInGapRanges(const PatternApplicationKey &key,
+ int spanStart, int spanEnd,
+ std::vector<SymbolRange> &ranges)
+{
+ for (std::size_t i = 0; i < key.size(); ++i) {
+ const PatternApplicationTrie *patNode = key[i];
+
+ if (patNode->IsTerminalNode()) {
+ continue;
+ }
+
+ SymbolRange &range = ranges[i];
+ AuxSymbolInfo &auxInfo = m_auxSymbolInfo[i];
+
+ // Determine minimum start position.
+ if (auxInfo.distanceToPrevTerminal == -1) {
+ // There are no preceding terminals in pattern.
+ range.minStart = spanStart + i;
+ } else {
+      // There is at least one preceding terminal in the pattern.
+ int j = i - auxInfo.distanceToPrevTerminal;
+ assert(ranges[j].minEnd == ranges[j].maxEnd);
+ range.minStart = ranges[j].maxEnd + auxInfo.distanceToPrevTerminal;
+ }
+
+ // Determine maximum start position.
+ if (i == 0) {
+ // Gap is leftmost symbol in pattern.
+ range.maxStart = spanStart;
+ } else if (auxInfo.distanceToPrevTerminal == 1) {
+ // Gap follows terminal so start position is fixed.
+ range.maxStart = ranges[i-1].maxEnd + 1;
+ } else if (auxInfo.distanceToNextTerminal == -1) {
+ // There are no succeeding terminals in the pattern.
+ int numFollowingGaps = (ranges.size()-1) - i;
+ range.maxStart = spanEnd - numFollowingGaps;
+ } else {
+ // There is at least one succeeding terminal in the pattern.
+ int j = i + auxInfo.distanceToNextTerminal;
+ range.maxStart = ranges[j].minStart - auxInfo.distanceToNextTerminal;
+ }
+
+ // Determine minimum end position.
+ if (i+1 == key.size()) {
+ // Gap is rightmost symbol in pattern.
+ range.minEnd = spanEnd;
+ } else if (auxInfo.distanceToNextTerminal == 1) {
+ // Gap immediately precedes terminal.
+ range.minEnd = ranges[i+1].minStart - 1;
+ } else if (auxInfo.distanceToPrevTerminal == -1) {
+ // There are no preceding terminals in pattern.
+ range.minEnd = spanStart + i;
+ } else {
+      // There is at least one preceding terminal in the pattern.
+ int j = i - auxInfo.distanceToPrevTerminal;
+ assert(ranges[j].minEnd == ranges[j].maxEnd);
+ range.minEnd = ranges[j].maxEnd + auxInfo.distanceToPrevTerminal;
+ }
+
+ // Determine maximum end position.
+ if (i+1 == key.size()) {
+ // Gap is rightmost symbol in pattern.
+ range.maxEnd = spanEnd;
+ } else if (auxInfo.distanceToNextTerminal == -1) {
+ // There are no succeeding terminals in the pattern.
+ int numFollowingGaps = (ranges.size()-1) - i;
+ range.maxEnd = spanEnd - numFollowingGaps;
+ } else {
+ // There is at least one succeeding terminal in the pattern.
+ int j = i + auxInfo.distanceToNextTerminal;
+ range.maxEnd = ranges[j].minStart - auxInfo.distanceToNextTerminal;
+ }
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
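
The two passes reduce to: for every gap, record how many symbols away the nearest terminal lies on each side, with -1 meaning there is none. A compact standalone rendering of the same forward/backward sweep (the bool array is an invented stand-in for the PAT key):

    #include <iostream>
    #include <vector>

    int main() {
      // true = terminal, false = gap; pattern: gap gap term gap term gap
      const bool isTerm[] = { false, false, true, false, true, false };
      const int n = 6;
      std::vector<int> toPrev(n, -1), toNext(n, -1);

      int d = -1;  // forward pass: distance to previous terminal
      for (int i = 0; i < n; ++i) {
        if (isTerm[i]) d = 1;  // terminals themselves keep -1
        else if (d != -1) toPrev[i] = d++;
      }
      d = -1;      // backward pass: distance to next terminal
      for (int i = n - 1; i >= 0; --i) {
        if (isTerm[i]) d = 1;
        else if (d != -1) toNext[i] = d++;
      }
      for (int i = 0; i < n; ++i)
        std::cout << i << ": prev=" << toPrev[i]
                  << " next=" << toNext[i] << "\n";
      return 0;
    }
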
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h
new file mode 100644
index 000000000..341fb9bb4
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <vector>
+
+#include "PatternApplicationTrie.h"
+#include "SymbolRange.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class SymbolRangeCalculator
+{
+ public:
+ void Calc(const PatternApplicationKey &, int, int,
+ std::vector<SymbolRange> &);
+
+ private:
+ // Provides contextual information used in determining a symbol's range.
+ struct AuxSymbolInfo {
+ int distanceToNextTerminal;
+ int distanceToPrevTerminal;
+ };
+
+ void FillInTerminalRanges(const PatternApplicationKey &,
+ std::vector<SymbolRange> &);
+
+ void FillInAuxSymbolInfo(const std::vector<SymbolRange> &);
+
+ void FillInGapRanges(const PatternApplicationKey &, int, int,
+ std::vector<SymbolRange> &);
+
+ std::vector<AuxSymbolInfo> m_auxSymbolInfo;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h
new file mode 100644
index 000000000..9ee16b186
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <utility>
+#include <vector>
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex;
+
+namespace S2T
+{
+
+/* Lattice in which a full path corresponds to the tail of a PHyperedge.
+ * For an entry x[i][j][k][l] in a TailLattice x:
+ *
+ * i = offset from start of rule pattern
+ *
+ * j = index of gap + 1 (zero indicates a terminal, otherwise the index is
+ * zero-based from the left of the rule pattern)
+ *
+ * k = arc width
+ *
+ * l = label index (zero for terminals, otherwise as in RuleTrieScope3::Node)
+ */
+typedef std::vector<
+ std::vector<
+ std::vector<
+ std::vector<const PVertex *> > > > TailLattice;
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
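
Concretely, a non-terminal vertex is addressed as lattice[offset][gapIndex+1][width][labelIndex], and a terminal as lattice[offset][0][width][0]. A tiny standalone demonstration of the indexing scheme, with invented contents and const char* standing in for const PVertex*:

    #include <iostream>
    #include <vector>

    // Four nested vectors, as in the TailLattice typedef.
    typedef std::vector<std::vector<std::vector<std::vector<const char *> > > >
    ToyLattice;

    int main() {
      // One start offset, one gap (j = 1), arc widths up to 2, two labels.
      ToyLattice lattice(1);
      lattice[0].resize(2);
      lattice[0][1].resize(3);
      lattice[0][1][1].push_back("NP over width 1");  // label 0, width 1
      lattice[0][1][1].push_back(0);                  // label 1 absent here
      lattice[0][1][2].push_back(0);                  // label 0 absent here
      lattice[0][1][2].push_back("VP over width 2");  // label 1, width 2

      const char *v = lattice[0][1][2][1];  // offset 0, gap 0, width 2, label 1
      std::cout << (v ? v : "no vertex") << "\n";
      return 0;
    }
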
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp
new file mode 100644
index 000000000..6b31090fc
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp
@@ -0,0 +1,131 @@
+#include "TailLatticeBuilder.h"
+
+#include "moses/Syntax/S2T/RuleTrieScope3.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+void TailLatticeBuilder::Build(
+ const std::vector<const PatternApplicationTrie *> &key,
+ const std::vector<SymbolRange> &ranges,
+ TailLattice &lattice,
+ std::vector<std::vector<bool> > &checkTable)
+{
+ assert(key.size() == ranges.size());
+ assert(key.size() > 0);
+
+ ExtendAndClear(key, ranges, lattice, checkTable);
+
+ const int spanStart = ranges.front().minStart;
+
+ const RuleTrieScope3::Node *utrieNode = key.back()->m_node;
+
+ const RuleTrieScope3::Node::LabelTable &labelTable =
+ utrieNode->GetLabelTable();
+
+ std::size_t nonTermIndex = 0;
+
+ for (std::size_t i = 0; i < ranges.size(); ++i) {
+ const SymbolRange &range = ranges[i];
+ const PatternApplicationTrie &patNode = *(key[i]);
+ if (patNode.IsTerminalNode()) {
+ std::size_t offset = range.minStart - spanStart;
+ std::size_t width = range.minEnd - range.minStart + 1;
+ assert(lattice[offset][0][width].empty());
+ lattice[offset][0][width].push_back(patNode.m_pvertex);
+ continue;
+ }
+ const std::vector<Word> &labelVec = labelTable[nonTermIndex];
+ assert(checkTable[nonTermIndex].size() == labelVec.size());
+ for (int s = range.minStart; s <= range.maxStart; ++s) {
+ for (int e = std::max(s, range.minEnd); e <= range.maxEnd; ++e) {
+ assert(e-s >= 0);
+ std::size_t offset = s - spanStart;
+ std::size_t width = e - s + 1;
+ assert(lattice[offset][nonTermIndex+1][width].empty());
+ std::vector<bool>::iterator q = checkTable[nonTermIndex].begin();
+ for (std::vector<Word>::const_iterator p = labelVec.begin();
+ p != labelVec.end(); ++p, ++q) {
+ const Word &label = *p;
+ const PVertex *v =
+ m_chart.GetCell(s, e).nonTerminalVertices.Find(label);
+ lattice[offset][nonTermIndex+1][width].push_back(v);
+ *q = (*q || static_cast<bool>(v));
+ }
+ }
+ }
+ ++nonTermIndex;
+ }
+}
+
+// Extend the lattice if necessary and clear the innermost vectors.
+void TailLatticeBuilder::ExtendAndClear(
+ const std::vector<const PatternApplicationTrie *> &key,
+ const std::vector<SymbolRange> &ranges,
+ TailLattice &lattice,
+ std::vector<std::vector<bool> > &checkTable)
+{
+ const int spanStart = ranges.front().minStart;
+ const int spanEnd = ranges.back().maxEnd;
+
+ const std::size_t span = spanEnd - spanStart + 1;
+
+ // Extend the outermost vector.
+ if (lattice.size() < span) {
+ lattice.resize(span);
+ }
+
+ const RuleTrieScope3::Node *utrieNode = key.back()->m_node;
+ const RuleTrieScope3::Node::LabelTable &labelTable =
+ utrieNode->GetLabelTable();
+
+ std::size_t nonTermIndex = 0;
+
+ for (std::size_t i = 0; i < ranges.size(); ++i) {
+ const SymbolRange &range = ranges[i];
+ const PatternApplicationTrie &patNode = *(key[i]);
+ if (patNode.IsTerminalNode()) {
+ std::size_t offset = range.minStart - spanStart;
+ std::size_t width = range.minEnd - range.minStart + 1;
+ if (lattice[offset].size() < 1) {
+ lattice[offset].resize(1);
+ }
+ if (lattice[offset][0].size() < width+1) {
+ lattice[offset][0].resize(width+1);
+ }
+ lattice[offset][0][width].clear();
+ continue;
+ }
+ const std::vector<Word> &labelVec = labelTable[nonTermIndex];
+ for (int s = range.minStart; s <= range.maxStart; ++s) {
+ for (int e = std::max(s, range.minEnd); e <= range.maxEnd; ++e) {
+ assert(e-s >= 0);
+ std::size_t offset = s - spanStart;
+ std::size_t width = e - s + 1;
+ if (lattice[offset].size() < nonTermIndex+2) {
+ lattice[offset].resize(nonTermIndex+2);
+ }
+ if (lattice[offset][nonTermIndex+1].size() < width+1) {
+ lattice[offset][nonTermIndex+1].resize(width+1);
+ }
+ lattice[offset][nonTermIndex+1][width].clear();
+ lattice[offset][nonTermIndex+1][width].reserve(labelVec.size());
+ }
+ }
+ if (checkTable.size() < nonTermIndex+1) {
+ checkTable.resize(nonTermIndex+1);
+ }
+ // Unlike the lattice itself, the check table must contain initial
+ // values prior to the main build procedure (and the values must be false).
+ checkTable[nonTermIndex].assign(labelVec.size(), false);
+ ++nonTermIndex;
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
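
ExtendAndClear follows a grow-only reuse policy: the nested vectors are enlarged when a wider span demands it but never shrunk, and only the innermost vectors are cleared, so repeated Build calls amortise allocation across the sentence. A minimal standalone sketch of that policy:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Grow-only buffer: resize upwards when needed, clear contents,
    // keep the capacity for the next call.
    void PrepareRow(std::vector<std::vector<int> > &row, std::size_t width) {
      if (row.size() < width + 1) {
        row.resize(width + 1);
      }
      for (std::size_t w = 0; w <= width; ++w) {
        row[w].clear();
      }
    }

    int main() {
      std::vector<std::vector<int> > row;
      PrepareRow(row, 3);  // grows to 4 slots
      row[2].push_back(7);
      PrepareRow(row, 2);  // narrower request: no shrink, slots 0..2 cleared
      std::cout << "slots=" << row.size()
                << " slot2-size=" << row[2].size() << "\n";  // 4 and 0
      return 0;
    }
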
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h
new file mode 100644
index 000000000..c61df8a40
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/Syntax/S2T/PChart.h"
+
+#include "PatternApplicationTrie.h"
+#include "SymbolRange.h"
+#include "TailLattice.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class TailLatticeBuilder
+{
+ public:
+ TailLatticeBuilder(PChart &chart) : m_chart(chart) {}
+
+ // Given a key from a PatternApplicationTrie and the valid ranges of its
+ // symbols, construct a TailLattice.
+ void Build(const std::vector<const PatternApplicationTrie *> &,
+ const std::vector<SymbolRange> &,
+ TailLattice &, std::vector<std::vector<bool> > &);
+
+ private:
+ // Auxiliary function used by Build. Enlarges a TailLattice, if necessary,
+ // and clears the innermost vectors.
+ void ExtendAndClear(const std::vector<const PatternApplicationTrie *> &,
+ const std::vector<SymbolRange> &,
+ TailLattice &, std::vector<std::vector<bool> > &);
+
+ PChart &m_chart;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h
new file mode 100644
index 000000000..a2897ce73
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "moses/Syntax/PHyperedge.h"
+
+#include "TailLattice.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename Callback>
+class TailLatticeSearcher
+{
+ public:
+ TailLatticeSearcher(const TailLattice &lattice,
+ const PatternApplicationKey &key,
+ const std::vector<SymbolRange> &ranges)
+ : m_lattice(lattice)
+ , m_key(key)
+ , m_ranges(ranges) {}
+
+ void Search(const std::vector<int> &labels, const TargetPhraseCollection &tpc,
+ Callback &callback) {
+ m_labels = &labels;
+ m_matchCB = &callback;
+ m_hyperedge.head = 0;
+ m_hyperedge.tail.clear();
+ m_hyperedge.translations = &tpc;
+ SearchInner(0, 0, 0);
+ }
+
+ private:
+ void SearchInner(int offset, std::size_t i, std::size_t nonTermIndex) {
+ assert(m_hyperedge.tail.size() == i);
+
+ const PatternApplicationTrie *patNode = m_key[i];
+ const SymbolRange &range = m_ranges[i];
+
+ if (patNode->IsTerminalNode()) {
+ const int width = range.minEnd - range.minStart + 1;
+ const PVertex *v = m_lattice[offset][0][width][0];
+ // FIXME Sort out const-ness
+ m_hyperedge.tail.push_back(const_cast<PVertex*>(v));
+ if (i == m_key.size()-1) {
+ (*m_matchCB)(m_hyperedge);
+ } else {
+ SearchInner(offset+width, i+1, nonTermIndex);
+ }
+ m_hyperedge.tail.pop_back();
+ return;
+ }
+
+ const int absStart = m_ranges[0].minStart + offset;
+ const int minWidth = std::max(1, range.minEnd - absStart + 1);
+ const int maxWidth = range.maxEnd - absStart + 1;
+
+ const std::vector<std::vector<const PVertex *> > &innerVec =
+ m_lattice[offset][nonTermIndex+1];
+
+ std::size_t labelIndex = (*m_labels)[nonTermIndex];
+
+ // Loop over all possible widths for this offset and index.
+ for (std::size_t width = minWidth; width <= maxWidth; ++width) {
+ const PVertex *v = innerVec[width][labelIndex];
+ if (!v) {
+ continue;
+ }
+ // FIXME Sort out const-ness
+ m_hyperedge.tail.push_back(const_cast<PVertex*>(v));
+ if (i == m_key.size()-1) {
+ (*m_matchCB)(m_hyperedge);
+ } else {
+ SearchInner(offset+width, i+1, nonTermIndex+1);
+ }
+ m_hyperedge.tail.pop_back();
+ }
+ }
+
+ const TailLattice &m_lattice;
+ const PatternApplicationKey &m_key;
+ const std::vector<SymbolRange> &m_ranges;
+ const std::vector<int> *m_labels;
+ Callback *m_matchCB;
+ PHyperedge m_hyperedge;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrie.h b/moses/Syntax/S2T/RuleTrie.h
new file mode 100644
index 000000000..8f6dcbb80
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrie.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cstddef>
+
+#include "moses/Syntax/RuleTable.h"
+
+namespace Moses
+{
+
+class Phrase;
+class TargetPhrase;
+class TargetPhraseCollection;
+class Word;
+
+namespace Syntax
+{
+namespace S2T
+{
+
+// Base class for parser-specific trie types.
+class RuleTrie : public RuleTable
+{
+ public:
+ RuleTrie(const RuleTableFF *ff) : RuleTable(ff) {}
+
+ virtual bool HasPreterminalRule(const Word &) const = 0;
+
+ private:
+ friend class RuleTrieCreator;
+
+ virtual TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS) = 0;
+
+ virtual void SortAndPrune(std::size_t) = 0;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieCYKPlus.cpp b/moses/Syntax/S2T/RuleTrieCYKPlus.cpp
new file mode 100644
index 000000000..9a300e9eb
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieCYKPlus.cpp
@@ -0,0 +1,151 @@
+#include "RuleTrieCYKPlus.h"
+
+#include <map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+
+#include "moses/NonTerminal.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+void RuleTrieCYKPlus::Node::Prune(std::size_t tableLimit)
+{
+  // recursively prune
+ for (SymbolMap::iterator p = m_sourceTermMap.begin();
+ p != m_sourceTermMap.end(); ++p) {
+ p->second.Prune(tableLimit);
+ }
+ for (SymbolMap::iterator p = m_nonTermMap.begin();
+ p != m_nonTermMap.end(); ++p) {
+ p->second.Prune(tableLimit);
+ }
+
+ // prune TargetPhraseCollection in this node
+ m_targetPhraseCollection.Prune(true, tableLimit);
+}
+
+void RuleTrieCYKPlus::Node::Sort(std::size_t tableLimit)
+{
+  // recursively sort
+ for (SymbolMap::iterator p = m_sourceTermMap.begin();
+ p != m_sourceTermMap.end(); ++p) {
+ p->second.Sort(tableLimit);
+ }
+ for (SymbolMap::iterator p = m_nonTermMap.begin();
+ p != m_nonTermMap.end(); ++p) {
+ p->second.Sort(tableLimit);
+ }
+
+  // sort TargetPhraseCollection in this node
+ m_targetPhraseCollection.Sort(true, tableLimit);
+}
+
+RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetOrCreateChild(
+ const Word &sourceTerm)
+{
+ return &m_sourceTermMap[sourceTerm];
+}
+
+RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetOrCreateNonTerminalChild(const Word &targetNonTerm)
+{
+ UTIL_THROW_IF2(!targetNonTerm.IsNonTerminal(),
+ "Not a non-terminal: " << targetNonTerm);
+
+ return &m_nonTermMap[targetNonTerm];
+}
+
+const RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetChild(
+ const Word &sourceTerm) const
+{
+ UTIL_THROW_IF2(sourceTerm.IsNonTerminal(),
+ "Not a terminal: " << sourceTerm);
+
+ SymbolMap::const_iterator p = m_sourceTermMap.find(sourceTerm);
+ return (p == m_sourceTermMap.end()) ? NULL : &p->second;
+}
+
+const RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetNonTerminalChild(
+ const Word &targetNonTerm) const
+{
+ UTIL_THROW_IF2(!targetNonTerm.IsNonTerminal(),
+ "Not a non-terminal: " << targetNonTerm);
+
+ SymbolMap::const_iterator p = m_nonTermMap.find(targetNonTerm);
+ return (p == m_nonTermMap.end()) ? NULL : &p->second;
+}
+
+TargetPhraseCollection &RuleTrieCYKPlus::GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
+{
+ Node &currNode = GetOrCreateNode(source, target, sourceLHS);
+ return currNode.GetTargetPhraseCollection();
+}
+
+RuleTrieCYKPlus::Node &RuleTrieCYKPlus::GetOrCreateNode(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
+{
+ const std::size_t size = source.GetSize();
+
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+ AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
+
+ Node *currNode = &m_root;
+ for (std::size_t pos = 0 ; pos < size ; ++pos) {
+ const Word& word = source.GetWord(pos);
+
+ if (word.IsNonTerminal()) {
+ // indexed by source label 1st
+ const Word &sourceNonTerm = word;
+
+ UTIL_THROW_IF2(iterAlign == alignmentInfo.end(),
+ "No alignment for non-term at position " << pos);
+ UTIL_THROW_IF2(iterAlign->first != pos,
+ "Alignment info incorrect at position " << pos);
+
+ std::size_t targetNonTermInd = iterAlign->second;
+ ++iterAlign;
+ const Word &targetNonTerm = target.GetWord(targetNonTermInd);
+ currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
+ } else {
+ currNode = currNode->GetOrCreateChild(word);
+ }
+
+ UTIL_THROW_IF2(currNode == NULL,
+ "Node not found at position " << pos);
+ }
+
+ // finally, the source LHS
+ //currNode = currNode->GetOrCreateChild(sourceLHS);
+
+ return *currNode;
+}
+
+void RuleTrieCYKPlus::SortAndPrune(std::size_t tableLimit)
+{
+ if (tableLimit) {
+ m_root.Sort(tableLimit);
+ }
+}
+
+bool RuleTrieCYKPlus::HasPreterminalRule(const Word &w) const
+{
+ const Node::SymbolMap &map = m_root.GetTerminalMap();
+ Node::SymbolMap::const_iterator p = map.find(w);
+ return p != map.end() && p->second.HasRules();
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieCYKPlus.h b/moses/Syntax/S2T/RuleTrieCYKPlus.h
new file mode 100644
index 000000000..83ea55b87
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieCYKPlus.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Terminal.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+
+#include "RuleTrie.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class RuleTrieCYKPlus : public RuleTrie
+{
+ public:
+ class Node
+ {
+ public:
+ typedef boost::unordered_map<Word, Node, SymbolHasher,
+ SymbolEqualityPred> SymbolMap;
+
+ bool IsLeaf() const {
+ return m_sourceTermMap.empty() && m_nonTermMap.empty();
+ }
+
+ bool HasRules() const { return !m_targetPhraseCollection.IsEmpty(); }
+
+ void Prune(std::size_t tableLimit);
+ void Sort(std::size_t tableLimit);
+
+ Node *GetOrCreateChild(const Word &sourceTerm);
+ Node *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
+
+ const Node *GetChild(const Word &sourceTerm) const;
+ const Node *GetNonTerminalChild(const Word &targetNonTerm) const;
+
+ const TargetPhraseCollection &GetTargetPhraseCollection() const {
+ return m_targetPhraseCollection;
+ }
+
+ TargetPhraseCollection &GetTargetPhraseCollection() {
+ return m_targetPhraseCollection;
+ }
+
+ const SymbolMap &GetTerminalMap() const { return m_sourceTermMap; }
+
+ const SymbolMap &GetNonTerminalMap() const { return m_nonTermMap; }
+
+ private:
+ SymbolMap m_sourceTermMap;
+ SymbolMap m_nonTermMap;
+ TargetPhraseCollection m_targetPhraseCollection;
+ };
+
+ RuleTrieCYKPlus(const RuleTableFF *ff) : RuleTrie(ff) {}
+
+ const Node &GetRootNode() const { return m_root; }
+
+ bool HasPreterminalRule(const Word &) const;
+
+ private:
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
+
+ Node &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS);
+
+ void SortAndPrune(std::size_t);
+
+ Node m_root;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieCreator.h b/moses/Syntax/S2T/RuleTrieCreator.h
new file mode 100644
index 000000000..1fe99e609
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieCreator.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "RuleTrie.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Base for classes that create a RuleTrie (currently RuleTrieLoader and
+// OovHandler). RuleTrieCreator is a friend of RuleTrie.
+class RuleTrieCreator
+{
+ protected:
+ // Provide access to RuleTrie's private SortAndPrune function.
+ void SortAndPrune(RuleTrie &trie, std::size_t limit) {
+ trie.SortAndPrune(limit);
+ }
+
+ // Provide access to RuleTrie's private GetOrCreateTargetPhraseCollection
+ // function.
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ RuleTrie &trie, const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS) {
+ return trie.GetOrCreateTargetPhraseCollection(source, target, sourceLHS);
+ }
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
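
This is the usual friend-gateway idiom: the trie keeps its mutating operations private and grants friendship to a single class, whose protected forwarders are then the only route by which derived loaders can mutate it. A self-contained sketch of the idiom (all names illustrative):

    #include <iostream>

    class Table {                // stands in for RuleTrie
      friend class Creator;
      void Insert(int x) { std::cout << "insert " << x << "\n"; }
    };

    class Creator {              // stands in for RuleTrieCreator
     protected:
      void Insert(Table &t, int x) { t.Insert(x); }  // the sole gateway
    };

    class Loader : public Creator {  // stands in for RuleTrieLoader
     public:
      void Load(Table &t) { Insert(t, 42); }
    };

    int main() {
      Table t;
      Loader l;
      l.Load(t);  // fine; calling t.Insert(42) here would not compile
      return 0;
    }
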
diff --git a/moses/Syntax/S2T/RuleTrieLoader.cpp b/moses/Syntax/S2T/RuleTrieLoader.cpp
new file mode 100644
index 000000000..8efa4969b
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieLoader.cpp
@@ -0,0 +1,156 @@
+#include "RuleTrieLoader.h"
+
+#include <sys/stat.h>
+#include <stdlib.h>
+
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <iostream>
+#include <math.h>
+
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+#include "moses/Util.h"
+#include "moses/InputFileStream.h"
+#include "moses/StaticData.h"
+#include "moses/WordsRange.h"
+#include "moses/UserMessage.h"
+#include "moses/ChartTranslationOptionList.h"
+#include "moses/FactorCollection.h"
+#include "moses/Syntax/RuleTableFF.h"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/double-conversion/double-conversion.h"
+#include "util/exception.hh"
+
+#include "RuleTrie.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
+ const std::vector<FactorType> &output,
+ const std::string &inFile,
+ const RuleTableFF &ff,
+ RuleTrie &trie)
+{
+ PrintUserTime(std::string("Start loading text phrase table. Moses format"));
+
+ const StaticData &staticData = StaticData::Instance();
+ const std::string &factorDelimiter = staticData.GetFactorDelimiter();
+
+ std::size_t count = 0;
+
+ std::ostream *progress = NULL;
+ IFVERBOSE(1) progress = &std::cerr;
+ util::FilePiece in(inFile.c_str(), progress);
+
+ // reused variables
+ std::vector<float> scoreVector;
+ StringPiece line;
+
+ double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+
+ while(true) {
+ try {
+ line = in.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+
+ util::TokenIter<util::MultiCharacter> pipes(line, "|||");
+ StringPiece sourcePhraseString(*pipes);
+ StringPiece targetPhraseString(*++pipes);
+ StringPiece scoreString(*++pipes);
+
+ StringPiece alignString;
+ if (++pipes) {
+ StringPiece temp(*pipes);
+ alignString = temp;
+ }
+
+ if (++pipes) {
+      StringPiece str(*pipes); // counts field (currently unused)
+ }
+
+ bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
+ if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
+      TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty source, skipping\n");
+ continue;
+ }
+
+ scoreVector.clear();
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+ int processed;
+ float score = converter.StringToFloat(s->data(), s->length(), &processed);
+ UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
+ scoreVector.push_back(FloorScore(TransformScore(score)));
+ }
+ const size_t numScoreComponents = ff.GetNumScoreComponents();
+ if (scoreVector.size() != numScoreComponents) {
+ UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
+ << numScoreComponents << ") of score components on line " << count);
+ }
+
+ // parse source & find pt node
+
+ // constituent labels
+ Word *sourceLHS = NULL;
+ Word *targetLHS;
+
+ // create target phrase obj
+ TargetPhrase *targetPhrase = new TargetPhrase(&ff);
+ // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
+ targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
+ // source
+ Phrase sourcePhrase;
+ // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
+ sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
+
+ // rest of target phrase
+ targetPhrase->SetAlignmentInfo(alignString);
+ targetPhrase->SetTargetLHS(targetLHS);
+
+ //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
+
+ if (++pipes) {
+ StringPiece sparseString(*pipes);
+ targetPhrase->SetSparseScore(&ff, sparseString);
+ }
+
+ if (++pipes) {
+ StringPiece propertiesString(*pipes);
+ targetPhrase->SetProperties(propertiesString);
+ }
+
+ targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
+ targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());
+
+ TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
+ trie, sourcePhrase, *targetPhrase, sourceLHS);
+ phraseColl.Add(targetPhrase);
+
+    // Not implemented correctly in the in-memory phrase table; just delete it for now.
+ delete sourceLHS;
+
+ count++;
+ }
+
+ // sort and prune each target phrase collection
+ if (ff.GetTableLimit()) {
+ SortAndPrune(trie, ff.GetTableLimit());
+ }
+
+ return true;
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
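
The loader consumes the standard Moses text rule-table format, source ||| target ||| scores ||| alignment, optionally followed by counts, sparse features, and properties. A minimal standalone parse of one such line (the splitter here is a simplified stand-in for util::TokenIter, and the example rule is invented):

    #include <iostream>
    #include <string>
    #include <vector>

    // Split a rule-table line on the "|||" field separator.
    std::vector<std::string> SplitFields(const std::string &line) {
      std::vector<std::string> fields;
      std::string::size_type pos = 0, next;
      while ((next = line.find("|||", pos)) != std::string::npos) {
        fields.push_back(line.substr(pos, next - pos));
        pos = next + 3;
      }
      fields.push_back(line.substr(pos));
      return fields;
    }

    int main() {
      const std::string line =
        "der [X][NN] [X] ||| the [X][NN] [X] ||| 0.6 0.7 ||| 0-0 1-1 2-2";
      std::vector<std::string> f = SplitFields(line);
      std::cout << "source:" << f[0] << "\n"
                << "target:" << f[1] << "\n"
                << "scores:" << f[2] << "\n"
                << "align: " << f[3] << "\n";
      return 0;
    }
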
diff --git a/moses/Syntax/S2T/RuleTrieLoader.h b/moses/Syntax/S2T/RuleTrieLoader.h
new file mode 100644
index 000000000..c625f91d6
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieLoader.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <istream>
+#include <vector>
+
+#include "moses/TypeDef.h"
+#include "moses/Syntax/RuleTableFF.h"
+
+#include "RuleTrie.h"
+#include "RuleTrieCreator.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class RuleTrieLoader : public RuleTrieCreator
+{
+ public:
+ bool Load(const std::vector<FactorType> &input,
+ const std::vector<FactorType> &output,
+ const std::string &inFile,
+ const RuleTableFF &,
+ RuleTrie &);
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieScope3.cpp b/moses/Syntax/S2T/RuleTrieScope3.cpp
new file mode 100644
index 000000000..a16cbefdc
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieScope3.cpp
@@ -0,0 +1,153 @@
+#include "RuleTrieScope3.h"
+
+#include <map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+
+#include "moses/NonTerminal.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+void RuleTrieScope3::Node::Prune(std::size_t tableLimit)
+{
+  // Recursively prune child node values.
+ for (TerminalMap::iterator p = m_terminalMap.begin();
+ p != m_terminalMap.end(); ++p) {
+ p->second.Prune(tableLimit);
+ }
+ if (m_gapNode) {
+ m_gapNode->Prune(tableLimit);
+ }
+
+ // Prune TargetPhraseCollections at this node.
+ for (LabelMap::iterator p = m_labelMap.begin(); p != m_labelMap.end(); ++p) {
+ p->second.Prune(true, tableLimit);
+ }
+}
+
+void RuleTrieScope3::Node::Sort(std::size_t tableLimit)
+{
+  // Recursively sort child node values.
+ for (TerminalMap::iterator p = m_terminalMap.begin();
+ p != m_terminalMap.end(); ++p) {
+ p->second.Sort(tableLimit);
+ }
+ if (m_gapNode) {
+ m_gapNode->Sort(tableLimit);
+ }
+
+ // Sort TargetPhraseCollections at this node.
+ for (LabelMap::iterator p = m_labelMap.begin(); p != m_labelMap.end(); ++p) {
+ p->second.Sort(true, tableLimit);
+ }
+}
+
+RuleTrieScope3::Node *RuleTrieScope3::Node::GetOrCreateTerminalChild(
+ const Word &sourceTerm)
+{
+ assert(!sourceTerm.IsNonTerminal());
+ std::pair<TerminalMap::iterator, bool> result;
+ result = m_terminalMap.insert(std::make_pair(sourceTerm, Node()));
+ const TerminalMap::iterator &iter = result.first;
+ Node &child = iter->second;
+ return &child;
+}
+
+RuleTrieScope3::Node *RuleTrieScope3::Node::GetOrCreateNonTerminalChild(
+ const Word &targetNonTerm)
+{
+ assert(targetNonTerm.IsNonTerminal());
+ if (m_gapNode == NULL) {
+ m_gapNode = new Node();
+ }
+ return m_gapNode;
+}
+
+TargetPhraseCollection &
+RuleTrieScope3::Node::GetOrCreateTargetPhraseCollection(
+ const TargetPhrase &target)
+{
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+ const std::size_t rank = alignmentInfo.GetSize();
+
+ std::vector<int> vec;
+ vec.reserve(rank);
+
+ m_labelTable.resize(rank);
+
+ int i = 0;
+ for (AlignmentInfo::const_iterator p = alignmentInfo.begin();
+ p != alignmentInfo.end(); ++p) {
+ std::size_t targetNonTermIndex = p->second;
+ const Word &targetNonTerm = target.GetWord(targetNonTermIndex);
+ vec.push_back(InsertLabel(i++, targetNonTerm));
+ }
+
+ return m_labelMap[vec];
+}
+
+TargetPhraseCollection &RuleTrieScope3::GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
+{
+ Node &currNode = GetOrCreateNode(source, target, sourceLHS);
+ return currNode.GetOrCreateTargetPhraseCollection(target);
+}
+
+RuleTrieScope3::Node &RuleTrieScope3::GetOrCreateNode(
+ const Phrase &source, const TargetPhrase &target, const Word */*sourceLHS*/)
+{
+ const std::size_t size = source.GetSize();
+
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+ AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
+
+ Node *currNode = &m_root;
+ for (std::size_t pos = 0 ; pos < size ; ++pos) {
+ const Word &word = source.GetWord(pos);
+
+ if (word.IsNonTerminal()) {
+ assert(iterAlign != alignmentInfo.end());
+ assert(iterAlign->first == pos);
+ std::size_t targetNonTermInd = iterAlign->second;
+ ++iterAlign;
+ const Word &targetNonTerm = target.GetWord(targetNonTermInd);
+ currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
+ } else {
+ currNode = currNode->GetOrCreateTerminalChild(word);
+ }
+
+ assert(currNode != NULL);
+ }
+
+ return *currNode;
+}
+
+void RuleTrieScope3::SortAndPrune(std::size_t tableLimit)
+{
+ if (tableLimit) {
+ m_root.Sort(tableLimit);
+ }
+}
+
+bool RuleTrieScope3::HasPreterminalRule(const Word &w) const
+{
+ const Node::TerminalMap &map = m_root.GetTerminalMap();
+ Node::TerminalMap::const_iterator p = map.find(w);
+ return p != map.end() && p->second.HasRules();
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieScope3.h b/moses/Syntax/S2T/RuleTrieScope3.h
new file mode 100644
index 000000000..6dd38a4f1
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieScope3.h
@@ -0,0 +1,106 @@
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+
+#include "RuleTrie.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class RuleTrieScope3 : public RuleTrie
+{
+ public:
+ class Node
+ {
+ public:
+ typedef std::vector<std::vector<Word> > LabelTable;
+
+ typedef boost::unordered_map<Word, Node, SymbolHasher,
+ SymbolEqualityPred> TerminalMap;
+
+ typedef boost::unordered_map<std::vector<int>,
+ TargetPhraseCollection> LabelMap;
+
+ ~Node() { delete m_gapNode; }
+
+ const LabelTable &GetLabelTable() const { return m_labelTable; }
+
+ const LabelMap &GetLabelMap() const { return m_labelMap; }
+
+ const TerminalMap &GetTerminalMap() const { return m_terminalMap; }
+
+ const Node *GetNonTerminalChild() const { return m_gapNode; }
+
+ Node *GetOrCreateTerminalChild(const Word &sourceTerm);
+
+ Node *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
+
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ const TargetPhrase &);
+
+ bool IsLeaf() const { return m_terminalMap.empty() && m_gapNode == NULL; }
+
+ bool HasRules() const { return !m_labelMap.empty(); }
+
+ void Prune(std::size_t tableLimit);
+ void Sort(std::size_t tableLimit);
+
+ private:
+ friend class RuleTrieScope3;
+
+ Node() : m_gapNode(NULL) {}
+
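+ // Return the index of w in row i of the label table, appending it first
+ // if absent (a linear scan; the label set per position is presumably
+ // small).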
+ int InsertLabel(int i, const Word &w) {
+ std::vector<Word> &inner = m_labelTable[i];
+ for (std::size_t j = 0; j < inner.size(); ++j) {
+ if (inner[j] == w) {
+ return j;
+ }
+ }
+ inner.push_back(w);
+ return inner.size()-1;
+ }
+
+ LabelTable m_labelTable;
+ LabelMap m_labelMap;
+ TerminalMap m_terminalMap;
+ Node *m_gapNode;
+ };
+
+ RuleTrieScope3(const RuleTableFF *ff) : RuleTrie(ff) {}
+
+ const Node &GetRootNode() const { return m_root; }
+
+ bool HasPreterminalRule(const Word &) const;
+
+ private:
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
+
+ Node &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS);
+
+ void SortAndPrune(std::size_t);
+
+ Node m_root;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/SChart.cpp b/moses/Syntax/S2T/SChart.cpp
new file mode 100644
index 000000000..f47d6efdb
--- /dev/null
+++ b/moses/Syntax/S2T/SChart.cpp
@@ -0,0 +1,20 @@
+#include "SChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+SChart::SChart(std::size_t width)
+{
+ m_cells.resize(width);
+ for (std::size_t i = 0; i < width; ++i) {
+ m_cells[i].resize(width);
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/SChart.h b/moses/Syntax/S2T/SChart.h
new file mode 100644
index 000000000..62b7d0c2b
--- /dev/null
+++ b/moses/Syntax/S2T/SChart.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "moses/Syntax/NonTerminalMap.h"
+#include "moses/Syntax/SVertexStack.h"
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class SChart
+{
+ public:
+ struct Cell
+ {
+ typedef boost::unordered_map<Word, SVertexStack, SymbolHasher,
+ SymbolEqualityPred> TMap;
+ typedef NonTerminalMap<SVertexStack> NMap;
+ TMap terminalStacks;
+ NMap nonTerminalStacks;
+ };
+
+ SChart(std::size_t width);
+
+ std::size_t GetWidth() const { return m_cells.size(); }
+
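+ // Cells are addressed by span: GetCell(start, end) covers input positions
+ // [start, end], so GetCell(0, GetWidth()-1) spans the whole sentence. The
+ // full width x width grid is allocated, although presumably only cells
+ // with start <= end are ever used.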
+ const Cell &GetCell(std::size_t start, std::size_t end) const {
+ return m_cells[start][end];
+ }
+
+ Cell &GetCell(std::size_t start, std::size_t end) {
+ return m_cells[start][end];
+ }
+
+ private:
+ std::vector<std::vector<Cell> > m_cells;
+};
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SHyperedge.cpp b/moses/Syntax/SHyperedge.cpp
new file mode 100644
index 000000000..0f098c7a4
--- /dev/null
+++ b/moses/Syntax/SHyperedge.cpp
@@ -0,0 +1,59 @@
+#include "SHyperedge.h"
+
+#include "moses/StaticData.h"
+
+#include "SVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+Phrase GetOneBestTargetYield(const SHyperedge &h)
+{
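+ // Walk the 1-best derivation: terminals are copied through, and each
+ // target non-terminal is expanded recursively via the best incoming
+ // hyperedge of the corresponding tail vertex.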
+ FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
+
+ Phrase ret(ARRAY_SIZE_INCR);
+
+ const AlignmentInfo::NonTermIndexMap &targetToSourceMap =
+ h.translation->GetAlignNonTerm().GetNonTermIndexMap2();
+
+ for (std::size_t pos = 0; pos < h.translation->GetSize(); ++pos) {
+ const Word &word = h.translation->GetWord(pos);
+ if (word.IsNonTerminal()) {
+ std::size_t sourceIndex = targetToSourceMap[pos];
+ const SHyperedge &incoming = *h.tail[sourceIndex]->best;
+ Phrase subPhrase = GetOneBestTargetYield(incoming);
+ ret.Append(subPhrase);
+ } else {
+ ret.AddWord(word);
+ if (placeholderFactor == NOT_FOUND) {
+ continue;
+ }
+ assert(false);
+ // FIXME Modify this chunk of code to work for SHyperedge.
+/*
+ std::set<std::size_t> sourcePosSet =
+ h.translation->GetAlignTerm().GetAlignmentsForTarget(pos);
+ if (sourcePosSet.size() == 1) {
+ const std::vector<const Word*> *ruleSourceFromInputPath =
+ hypo.GetTranslationOption().GetSourceRuleFromInputPath();
+ UTIL_THROW_IF2(ruleSourceFromInputPath == NULL,
+ "Source Words in of the rules hasn't been filled out");
+ std::size_t sourcePos = *sourcePosSet.begin();
+ const Word *sourceWord = ruleSourceFromInputPath->at(sourcePos);
+ UTIL_THROW_IF2(sourceWord == NULL,
+ "Null source word at position " << sourcePos);
+ const Factor *factor = sourceWord->GetFactor(placeholderFactor);
+ if (factor) {
+ ret.Back()[0] = factor;
+ }
+ }
+*/
+ }
+ }
+ return ret;
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SHyperedge.h b/moses/Syntax/SHyperedge.h
new file mode 100644
index 000000000..6d9128d49
--- /dev/null
+++ b/moses/Syntax/SHyperedge.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/Phrase.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TargetPhrase.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct SVertex;
+
+struct SHyperedge
+{
+ SVertex *head;
+ std::vector<SVertex*> tail;
+ float score;
+ ScoreComponentCollection scoreBreakdown;
+ const TargetPhrase *translation;
+};
+
+Phrase GetOneBestTargetYield(const SHyperedge &h);
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SHyperedgeBundle.h b/moses/Syntax/SHyperedgeBundle.h
new file mode 100644
index 000000000..4a78c5458
--- /dev/null
+++ b/moses/Syntax/SHyperedgeBundle.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TargetPhraseCollection.h"
+
+#include "SVertexStack.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex;
+
+struct SHyperedgeBundle
+{
+ std::vector<const SVertexStack*> stacks;
+ const TargetPhraseCollection *translations;
+
+ friend void swap(SHyperedgeBundle &x, SHyperedgeBundle &y) {
+ using std::swap;
+ swap(x.stacks, y.stacks);
+ swap(x.translations, y.translations);
+ }
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SHyperedgeBundleScorer.h b/moses/Syntax/SHyperedgeBundleScorer.h
new file mode 100644
index 000000000..3bf547cfd
--- /dev/null
+++ b/moses/Syntax/SHyperedgeBundleScorer.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "SHyperedgeBundle.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct SHyperedgeBundleScorer
+{
+ public:
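+ // Estimate the bundle's score as the future score of its first translation
+ // (assumed to be the best, i.e. the collection is sorted) plus the score
+ // of the best incoming hyperedge of each tail stack's top vertex.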
+ static float Score(const SHyperedgeBundle &bundle) {
+ const TargetPhrase &targetPhrase = **(bundle.translations->begin());
+ float score = targetPhrase.GetFutureScore();
+ for (std::vector<const SVertexStack*>::const_iterator p =
+ bundle.stacks.begin(); p != bundle.stacks.end(); ++p) {
+ const SVertexStack *stack = *p;
+ if (stack->front()->best) {
+ score += stack->front()->best->score;
+ }
+ }
+ return score;
+ }
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SVertex.cpp b/moses/Syntax/SVertex.cpp
new file mode 100644
index 000000000..32650b2a8
--- /dev/null
+++ b/moses/Syntax/SVertex.cpp
@@ -0,0 +1,28 @@
+#include "SVertex.h"
+
+#include "moses/FF/FFState.h"
+
+#include "SHyperedge.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+SVertex::~SVertex()
+{
+ // Delete incoming SHyperedge objects.
+ delete best;
+ for (std::vector<SHyperedge*>::iterator p = recombined.begin();
+ p != recombined.end(); ++p) {
+ delete *p;
+ }
+ // Delete FFState objects.
+ for (std::vector<FFState*>::iterator p = state.begin();
+ p != state.end(); ++p) {
+ delete *p;
+ }
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SVertex.h b/moses/Syntax/SVertex.h
new file mode 100644
index 000000000..cde14c21a
--- /dev/null
+++ b/moses/Syntax/SVertex.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <vector>
+
+namespace Moses
+{
+
+class FFState;
+
+namespace Syntax
+{
+
+struct PVertex;
+struct SHyperedge;
+
+// A vertex in the search hypergraph.
+//
+// Important: a SVertex owns its incoming SHyperedge objects and its FFState
+// objects and will delete them on destruction.
+struct SVertex
+{
+ ~SVertex();
+
+ SHyperedge *best;
+ std::vector<SHyperedge*> recombined;
+ const PVertex *pvertex;
+ std::vector<FFState*> state;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SVertexRecombinationOrderer.h b/moses/Syntax/SVertexRecombinationOrderer.h
new file mode 100644
index 000000000..60686d989
--- /dev/null
+++ b/moses/Syntax/SVertexRecombinationOrderer.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "moses/FF/FFState.h"
+
+#include "SVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct SVertexRecombinationOrderer
+{
+ public:
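+ // Strict weak ordering over feature-function state vectors: two vertices
+ // that compare equal carry pairwise-equal states and are therefore
+ // candidates for recombination. NULL states are ordered by pointer value;
+ // the two vectors are assumed to have the same length.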
+ bool operator()(const SVertex &x, const SVertex &y) const
+ {
+ int comp = 0;
+ for (std::size_t i = 0; i < x.state.size(); ++i) {
+ if (x.state[i] == NULL || y.state[i] == NULL) {
+ comp = x.state[i] - y.state[i];
+ } else {
+ comp = x.state[i]->Compare(*y.state[i]);
+ }
+ if (comp != 0) {
+ return comp < 0;
+ }
+ }
+ return false;
+ }
+
+ bool operator()(const SVertex *x, const SVertex *y) const
+ {
+ return operator()(*x, *y);
+ }
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SVertexStack.h b/moses/Syntax/SVertexStack.h
new file mode 100644
index 000000000..57dc9f247
--- /dev/null
+++ b/moses/Syntax/SVertexStack.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+
+#include "SHyperedge.h"
+#include "SVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+typedef std::vector<boost::shared_ptr<SVertex> > SVertexStack;
+
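+// Orders stack entries best-first by the score of each vertex's best
+// incoming hyperedge (operator> so that sorting yields descending scores).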
+struct SVertexStackContentOrderer
+{
+ public:
+ bool operator()(const boost::shared_ptr<SVertex> &x,
+ const boost::shared_ptr<SVertex> &y)
+ {
+ return x->best->score > y->best->score;
+ }
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SymbolEqualityPred.h b/moses/Syntax/SymbolEqualityPred.h
new file mode 100644
index 000000000..e97c4f11b
--- /dev/null
+++ b/moses/Syntax/SymbolEqualityPred.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "moses/Factor.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// Assumes that only the first factor is relevant, i.e. factored decoding
+// will *not* work in moses_chart unless this is changed (among other things).
+class SymbolEqualityPred
+{
+ public:
+ bool operator()(const Word &s1, const Word &s2) const {
+ const Factor *f1 = s1[0];
+ const Factor *f2 = s2[0];
+ return !(f1->Compare(*f2));
+ }
+};
+
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/SymbolHasher.h b/moses/Syntax/SymbolHasher.h
new file mode 100644
index 000000000..b398fdd00
--- /dev/null
+++ b/moses/Syntax/SymbolHasher.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <boost/functional/hash.hpp>
+
+#include "moses/Factor.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// Assumes that only the first factor is relevant, i.e. factored decoding
+// will *not* work in moses_chart unless this is changed (among other things).
+class SymbolHasher
+{
+ public:
+ std::size_t operator()(const Word &s) const {
+ const Factor *f = s[0];
+ return hash_value(*f);
+ }
+};
+
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp
index 30419e9c7..9fb33d2a9 100644
--- a/moses/TargetPhrase.cpp
+++ b/moses/TargetPhrase.cpp
@@ -177,18 +177,18 @@ void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
}
-void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll)
-{
- const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
- m_alignTerm = alignmentInfo;
-
-}
-
-void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll)
-{
- const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
- m_alignNonTerm = alignmentInfo;
-}
+// void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll)
+// {
+// const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
+// m_alignTerm = alignmentInfo;
+
+// }
+
+// void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll)
+// {
+// const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
+// m_alignNonTerm = alignmentInfo;
+// }
void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString)
{
diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h
index d23e946c0..35d27feea 100644
--- a/moses/TargetPhrase.h
+++ b/moses/TargetPhrase.h
@@ -28,6 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Phrase.h"
#include "ScoreComponentCollection.h"
#include "AlignmentInfo.h"
+#include "AlignmentInfoCollection.h"
#include "moses/PP/PhraseProperty.h"
#include "util/string_piece.hh"
@@ -121,8 +122,24 @@ public:
m_alignNonTerm = alignNonTerm;
}
- void SetAlignTerm(const AlignmentInfo::CollType &coll);
- void SetAlignNonTerm(const AlignmentInfo::CollType &coll);
+ // ALNREP = alignment representation,
+ // see AlignmentInfo constructors for supported representations
+ template<typename ALNREP>
+ void
+ SetAlignTerm(const ALNREP &coll)
+ {
+ m_alignTerm = AlignmentInfoCollection::Instance().Add(coll);
+ }
+
+ // ALNREP = alignment representation,
+ // see AlignmentInfo constructors for supported representations
+ template<typename ALNREP>
+ void
+ SetAlignNonTerm(const ALNREP &coll)
+ {
+ m_alignNonTerm = AlignmentInfoCollection::Instance().Add(coll);
+ }
+
const AlignmentInfo &GetAlignTerm() const {
return *m_alignTerm;
diff --git a/moses/ThreadPool.cpp b/moses/ThreadPool.cpp
index 9d0cdd06b..265c150c2 100644
--- a/moses/ThreadPool.cpp
+++ b/moses/ThreadPool.cpp
@@ -55,8 +55,11 @@ void ThreadPool::Execute()
}
//Execute job
if (task) {
+ // Query DeleteAfterExecution() before Run(): once Run() returns, the main
+ // thread may already have deleted the task (race condition).
+ bool del = task->DeleteAfterExecution();
task->Run();
- if (task->DeleteAfterExecution()) {
+ if (del) {
delete task;
}
}
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp
index cfdbc3aa6..d93632b7e 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.cpp
+++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp
@@ -206,12 +206,9 @@ LoadCorpus(FactorDirection direction,
int sntIdx(0);
// corpus.seekg(0); Seems needless -> commented out to allow
// loading of gzipped corpora (gzfilebuf doesn't support seeking).
- const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
while(getline(corpus, line)) {
sntArray.push_back(sntIdx);
Phrase phrase(ARRAY_SIZE_INCR);
- // parse phrase
- // phrase.CreateFromString( direction, factors, line, factorDelimiter, NULL);
phrase.CreateFromString( direction, factors, line, NULL);
// store words in vocabulary and corpus
for( size_t i = 0; i < phrase.GetSize(); ++i) {
@@ -499,11 +496,9 @@ addSntPair(string& source, string& target, string& alignment)
vuint_t srcFactor, trgFactor;
cerr << "source, target, alignment = " << source << ", "
<< target << ", " << alignment << endl;
- const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR);
- // sphrase.CreateFromString(Input, m_inputFactors, source, factorDelimiter, NULL);
sphrase.CreateFromString(Input, m_inputFactors, source, NULL);
m_srcVocab->MakeOpen();
vector<wordID_t> sIDs(sphrase.GetSize());
@@ -519,7 +514,6 @@ addSntPair(string& source, string& target, string& alignment)
m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
m_srcVocab->MakeClosed();
Phrase tphrase(ARRAY_SIZE_INCR);
- // tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL);
tphrase.CreateFromString(Output, m_outputFactors, target, NULL);
m_trgVocab->MakeOpen();
vector<wordID_t> tIDs(tphrase.GetSize());
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
index c9508873b..ac6522ef5 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
@@ -154,7 +154,6 @@ void ChartRuleLookupManagerMemory::UpdateCompressedMatrix(size_t startPos,
#if !defined(UNLABELLED_SOURCE)
// source non-terminal labels for the span
const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
- const std::vector<bool> &sourceNonTermArray = inputPath.GetNonTerminalArray();
// can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
if (inputPath.GetNonTerminalSet().size() == 0) {
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
index 010608665..784d31deb 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
@@ -154,7 +154,6 @@ void ChartRuleLookupManagerMemoryPerSentence::UpdateCompressedMatrix(size_t star
#if !defined(UNLABELLED_SOURCE)
// source non-terminal labels for the span
const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
- const std::vector<bool> &sourceNonTermArray = inputPath.GetNonTerminalArray();
// can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
if (inputPath.GetNonTerminalSet().size() == 0) {
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp
index 163be8937..01f90fd82 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp
@@ -39,14 +39,12 @@ ChartRuleLookupManagerOnDisk::ChartRuleLookupManagerOnDisk(
const PhraseDictionaryOnDisk &dictionary,
OnDiskPt::OnDiskWrapper &dbWrapper,
const std::vector<FactorType> &inputFactorsVec,
- const std::vector<FactorType> &outputFactorsVec,
- const std::string &filePath)
+ const std::vector<FactorType> &outputFactorsVec)
: ChartRuleLookupManagerCYKPlus(parser, cellColl)
, m_dictionary(dictionary)
, m_dbWrapper(dbWrapper)
, m_inputFactorsVec(inputFactorsVec)
, m_outputFactorsVec(outputFactorsVec)
- , m_filePath(filePath)
{
UTIL_THROW_IF2(m_expandableDottedRuleListVec.size() != 0,
"Dotted rule collection not correctly initialized");
@@ -177,7 +175,6 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
bool doSearch = true;
if (m_dictionary.m_maxSpanDefault != NOT_FOUND) {
// for Hieu's source syntax
- const Word &targetLHS = cellLabel.GetLabel();
bool isSourceSyntaxNonTerm = sourceLHS != defaultSourceNonTerm;
size_t nonTermNumWordsCovered = endPos - startPos + 1;
@@ -186,8 +183,6 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
nonTermNumWordsCovered <= m_dictionary.m_maxSpanLabelled :
nonTermNumWordsCovered <= m_dictionary.m_maxSpanDefault;
- //cerr << "sourceLHS=" << sourceLHS << " targetLHS=" << targetLHS
- // << "doSearch=" << doSearch << endl;
}
if (doSearch) {
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
index 6213d3b67..6f2f71cdd 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
@@ -41,8 +41,7 @@ public:
const PhraseDictionaryOnDisk &dictionary,
OnDiskPt::OnDiskWrapper &dbWrapper,
const std::vector<FactorType> &inputFactorsVec,
- const std::vector<FactorType> &outputFactorsVec,
- const std::string &filePath);
+ const std::vector<FactorType> &outputFactorsVec);
~ChartRuleLookupManagerOnDisk();
@@ -55,7 +54,6 @@ private:
OnDiskPt::OnDiskWrapper &m_dbWrapper;
const std::vector<FactorType> &m_inputFactorsVec;
const std::vector<FactorType> &m_outputFactorsVec;
- const std::string &m_filePath;
std::vector<DottedRuleStackOnDisk*> m_expandableDottedRuleListVec;
std::map<UINT64, const TargetPhraseCollection*> m_cache;
std::list<const OnDiskPt::PhraseNode*> m_sourcePhraseNode;
diff --git a/moses/TranslationModel/CompactPT/MurmurHash3.cpp b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
index d16cd9502..4e4cd4258 100644
--- a/moses/TranslationModel/CompactPT/MurmurHash3.cpp
+++ b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
@@ -29,7 +29,7 @@
#else // defined(_MSC_VER)
-#define FORCE_INLINE __attribute__((always_inline))
+#define FORCE_INLINE inline __attribute__((always_inline))
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
index 30f43a4fe..3bf0d2820 100644
--- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
@@ -717,7 +717,7 @@ std::string PhraseTableCreator::EncodeLine(std::vector<std::string>& tokens, siz
std::stringstream strme;
strme << "Error: Wrong number of scores detected ("
<< scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
- strme << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[3] << " ..." << std::endl;
+ strme << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[2] << " ..." << std::endl;
UTIL_THROW2(strme.str());
}
@@ -1146,7 +1146,7 @@ void EncodingTask::operator()()
UTIL_THROW2(strme.str());
}
- if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
+ if(tokens.size() > 3 && tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
std::stringstream strme;
strme << "Error: It seems the following line contains no alignment information, " << std::endl;
strme << "but you are using ";
diff --git a/moses/TranslationModel/DynSAInclude/params.cpp b/moses/TranslationModel/DynSAInclude/params.cpp
index 27cf06a51..fa99c4838 100644
--- a/moses/TranslationModel/DynSAInclude/params.cpp
+++ b/moses/TranslationModel/DynSAInclude/params.cpp
@@ -123,11 +123,12 @@ bool Parameters::setParamValue(const std::string& name, const std::string& val)
std::string Parameters::getParamValue(const std::string& name)
{
std::string value = Parameters::kNotSetValue;
- if(isValidParamName(name))
+ if(isValidParamName(name)) {
if(params_.find(name) != params_.end())
value = params_[name].value;
else if(getValueType(name) == kBoolValue)
value = kFalseValue;
+ }
return value;
}
std::string Parameters::getParam(const std::string& name)
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
index 6a3174a59..9a9739de6 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
@@ -31,7 +31,7 @@ void OutputVec(const vector<T> &vec)
}
// from phrase-extract/tables-core.cpp
-vector<string> tokenize( const char* input )
+inline vector<string> tokenize( const char* input )
{
vector< string > token;
bool betweenWords = true;
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
index 722035d1e..c948b66b2 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
@@ -30,8 +30,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/UserMessage.h"
#include <exception>
-extern std::vector<std::string> tokenize( const char*);
-
namespace Moses
{
diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
index f46d3a440..fe87594fa 100644
--- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
@@ -152,7 +152,6 @@ bool RuleTableLoaderStandard::Load(FormatType format
PrintUserTime(string("Start loading text phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");
const StaticData &staticData = StaticData::Instance();
- const std::string& factorDelimiter = staticData.GetFactorDelimiter();
string lineOrig;
size_t count = 0;
@@ -223,11 +222,9 @@ bool RuleTableLoaderStandard::Load(FormatType format
// create target phrase obj
TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable);
- // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
// source
Phrase sourcePhrase;
- // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
// rest of target phrase
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
index 81fd43fcb..821b81c51 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
@@ -49,7 +49,7 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
bool ret = loader->Load(m_input, m_output, grammarFile, m_tableLimit,
*this);
- UTIL_THROW_IF2(ret == NULL,
+ UTIL_THROW_IF2(!ret,
"Rules not successfully loaded for sentence id " << translationId);
}
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
index 48ed91e4b..ba1971e3a 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
@@ -212,7 +212,6 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSenten
PrintUserTime("Start loading fuzzy-match phrase model");
const StaticData &staticData = StaticData::Instance();
- const std::string& factorDelimiter = staticData.GetFactorDelimiter();
string lineOrig;
@@ -266,12 +265,10 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSenten
// source
Phrase sourcePhrase( 0);
- // sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);
sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);
// create target phrase obj
TargetPhrase *targetPhrase = new TargetPhrase(this);
- // targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);
targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);
// rest of target phrase
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
index 581842494..4d3f9fbd3 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
@@ -59,7 +59,7 @@ ChartRuleLookupManager *PhraseDictionaryOnDisk::CreateRuleLookupManager(
return new ChartRuleLookupManagerOnDisk(parser, cellCollection, *this,
GetImplementation(),
m_input,
- m_output, m_filePath);
+ m_output);
}
OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation()
diff --git a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
index 6f1abed9e..cfc86b8fc 100644
--- a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
+++ b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
@@ -52,12 +52,13 @@ namespace ugdiss
VAL operator[](ID key) const;
};
- Cell* data;
- VAL *M1, *M2;
- OFFSET * index;
+ Cell const* data;
+ VAL const* M1;
+ VAL const* M2;
+ OFFSET const* index;
ID numRows;
ID numCols;
- boost::shared_ptr<bio::mapped_file> file;
+ boost::shared_ptr<bio::mapped_file_source> file;
VAL m1(ID key) const
{
@@ -120,8 +121,8 @@ namespace ugdiss
string foo = msg.str();
UTIL_THROW(util::Exception,foo.c_str());
}
- file.reset(new bio::mapped_file());
- file->open(fname,ios::in|ios::out);
+ file.reset(new bio::mapped_file_source());
+ file->open(fname);
if (!file->is_open())
{
ostringstream msg;
@@ -130,14 +131,14 @@ namespace ugdiss
string foo = msg.str();
UTIL_THROW(util::Exception,foo.c_str());
}
- char* p = file->data();
- filepos_type offset = *reinterpret_cast<filepos_type*>(p);
- index = reinterpret_cast<OFFSET*>(p+offset); p += sizeof(offset);
+ char const* p = file->data();
+ filepos_type offset = *reinterpret_cast<filepos_type const*>(p);
+ index = reinterpret_cast<OFFSET const*>(p+offset); p += sizeof(offset);
numRows = *reinterpret_cast<ID const*>(p); p += sizeof(id_type);
numCols = *reinterpret_cast<ID const*>(p); p += sizeof(id_type);
- data = reinterpret_cast<Cell*>(p);
+ data = reinterpret_cast<Cell const*>(p);
// cout << numRows << " rows; " << numCols << " columns " << endl;
- M1 = reinterpret_cast<VAL*>(index+numRows+1);
+ M1 = reinterpret_cast<VAL const*>(index+numRows+1);
M2 = M1+numRows;
// cout << "Table " << fname << " has " << numRows << " rows and "
// << numCols << " columns." << endl;
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index 45c9fde6b..5b52161ca 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -508,6 +508,7 @@ namespace Moses
Word w; w.CreateFromString(Output,ofactor,wrd,false);
tp->AddWord(w);
}
+ tp->SetAlignTerm(pool.aln);
tp->GetScoreBreakdown().Assign(this, fvals);
tp->EvaluateInIsolation(src);
return tp;
diff --git a/moses/TranslationModel/UG/sim-pe.cc b/moses/TranslationModel/UG/sim-pe.cc
index 58a70cab4..460d66c1f 100644
--- a/moses/TranslationModel/UG/sim-pe.cc
+++ b/moses/TranslationModel/UG/sim-pe.cc
@@ -42,7 +42,8 @@ translate(string const& source)
istringstream ibuf(source+"\n");
sentence.Read(ibuf,ifo);
- Manager manager(lineNumber, sentence, global.GetSearchAlgorithm());
+ // Manager manager(lineNumber, sentence, global.GetSearchAlgorithm());
+ Manager manager(sentence, global.GetSearchAlgorithm());
manager.ProcessSentence();
ostringstream obuf;
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index a91c58343..26dce03d0 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -323,17 +323,10 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// do not try to find the best ... report multiple matches
if (multiple_flag) {
- int input_letter_length = compute_length( input[sentenceInd] );
for(int si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
- unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
- // do not report multiple identical sentences, but just their count
- //cout << sentenceInd << " "; // sentence number
- //cout << letter_cost << "/" << input_letter_length << " ";
- //cout << "(" << best_cost <<"/" << input_length <<") ";
- //cout << "||| " << s << " ||| " << path << endl;
-
+ sed( input[sentenceInd], source[s], path, true );
const vector<WORD_ID> &sourceSentence = source[s];
vector<SentenceAlignment> &targets = targetAndAlignment[s];
create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
@@ -707,7 +700,7 @@ unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentenc
/* brute force method: compare input to all corpus sentences */
-int FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
+void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
vector< vector< WORD_ID > > input )
{
// go through input set...
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
index 6405ae566..da50b64b9 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
@@ -60,8 +60,8 @@ protected:
void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
/** brute force method: compare input to all corpus sentences */
- int basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
- std::vector< std::vector< tmmt::WORD_ID > > input ) ;
+ void basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
+ std::vector< std::vector< tmmt::WORD_ID > > input ) ;
 /** utility function: compute length of sentence in characters
(spaces do not count) */
diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp
new file mode 100644
index 000000000..6accf8f2e
--- /dev/null
+++ b/moses/TranslationTask.cpp
@@ -0,0 +1,434 @@
+#include "TranslationTask.h"
+#include "moses/StaticData.h"
+#include "moses/Sentence.h"
+#include "moses/IOWrapper.h"
+#include "moses/TranslationAnalysis.h"
+#include "moses/TypeDef.h"
+#include "moses/Util.h"
+#include "moses/InputType.h"
+#include "moses/OutputCollector.h"
+#include "moses/Incremental.h"
+#include "mbr.h"
+
+#include "moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h"
+#include "moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h"
+
+#include "util/exception.hh"
+
+using namespace std;
+
+namespace Moses
+{
+
+TranslationTask::TranslationTask(InputType* source, Moses::IOWrapper &ioWrapper,
+ bool outputSearchGraphSLF,
+ boost::shared_ptr<HypergraphOutput<Manager> > hypergraphOutput)
+: m_source(source)
+, m_ioWrapper(ioWrapper)
+, m_outputSearchGraphSLF(outputSearchGraphSLF)
+, m_hypergraphOutput(hypergraphOutput)
+, m_pbOrChart(1)
+{}
+
+TranslationTask::TranslationTask(InputType *source, IOWrapper &ioWrapper,
+boost::shared_ptr<HypergraphOutput<ChartManager> > hypergraphOutputChart)
+: m_source(source)
+, m_ioWrapper(ioWrapper)
+, m_outputSearchGraphSLF(false) // unused in chart decoding; initialized to avoid an indeterminate read
+, m_hypergraphOutputChart(hypergraphOutputChart)
+, m_pbOrChart(2)
+{}
+
+TranslationTask::~TranslationTask() {
+ delete m_source;
+}
+
+void TranslationTask::Run()
+{
+ switch (m_pbOrChart)
+ {
+ case 1:
+ RunPb();
+ break;
+ case 2:
+ RunChart();
+ break;
+ default:
+ UTIL_THROW(util::Exception, "Unknown value: " << m_pbOrChart);
+ }
+}
+
+
+void TranslationTask::RunPb()
+{
+ // shorthand for "global data"
+ const StaticData &staticData = StaticData::Instance();
+
+ // input sentence
+ Sentence sentence;
+
+ // report wall time spent on translation
+ Timer translationTime;
+ translationTime.start();
+
+ // report thread number
+#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
+ TRACE_ERR("Translating line " << m_source->GetTranslationId() << " in thread id " << pthread_self() << endl);
+#endif
+
+
+ // execute the translation
+ // note: this executes the search, resulting in a search graph
+ // we still need to apply the decision rule (MAP, MBR, ...)
+ Timer initTime;
+ initTime.start();
+ Manager manager(*m_source,staticData.GetSearchAlgorithm());
+ VERBOSE(1, "Line " << m_source->GetTranslationId() << ": Initialize search took " << initTime << " seconds total" << endl);
+ manager.ProcessSentence();
+
+ // we are done with search, let's look what we got
+ Timer additionalReportingTime;
+ additionalReportingTime.start();
+
+ // output word graph
+ if (m_ioWrapper.GetWordGraphCollector()) {
+ ostringstream out;
+ fix(out,PRECISION);
+ manager.GetWordGraph(m_source->GetTranslationId(), out);
+ m_ioWrapper.GetWordGraphCollector()->Write(m_source->GetTranslationId(), out.str());
+ }
+
+ // output search graph
+ if (m_ioWrapper.GetSearchGraphOutputCollector()) {
+ ostringstream out;
+ fix(out,PRECISION);
+ manager.OutputSearchGraph(m_source->GetTranslationId(), out);
+ m_ioWrapper.GetSearchGraphOutputCollector()->Write(m_source->GetTranslationId(), out.str());
+
+#ifdef HAVE_PROTOBUF
+ if (staticData.GetOutputSearchGraphPB()) {
+ ostringstream sfn;
+ sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << m_source->GetTranslationId() << ".pb" << ends;
+ string fn = sfn.str();
+ VERBOSE(2, "Writing search graph to " << fn << endl);
+ fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out);
+ manager.SerializeSearchGraphPB(m_source->GetTranslationId(), output);
+ }
+#endif
+ }
+
+ // Output search graph in HTK standard lattice format (SLF)
+ if (m_outputSearchGraphSLF) {
+ stringstream fileName;
+ fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_source->GetTranslationId() << ".slf";
+ ofstream *file = new ofstream;
+ file->open(fileName.str().c_str());
+ if (file->is_open() && file->good()) {
+ ostringstream out;
+ fix(out,PRECISION);
+ manager.OutputSearchGraphAsSLF(m_source->GetTranslationId(), out);
+ *file << out.str();
+ file->flush();
+ } else {
+ TRACE_ERR("Cannot output HTK standard lattice for line " << m_source->GetTranslationId() << " because the output file is not open or not ready for writing" << endl);
+ }
+ delete file;
+ }
+
+ // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
+ if (m_hypergraphOutput.get()) {
+ m_hypergraphOutput->Write(manager);
+ }
+
+ additionalReportingTime.stop();
+
+ // apply decision rule and output best translation(s)
+ if (m_ioWrapper.GetSingleBestOutputCollector()) {
+ ostringstream out;
+ ostringstream debug;
+ fix(debug,PRECISION);
+
+ // all derivations - send them to debug stream
+ if (staticData.PrintAllDerivations()) {
+ additionalReportingTime.start();
+ manager.PrintAllDerivations(m_source->GetTranslationId(), debug);
+ additionalReportingTime.stop();
+ }
+
+ Timer decisionRuleTime;
+ decisionRuleTime.start();
+
+ // MAP decoding: best hypothesis
+ const Hypothesis* bestHypo = NULL;
+ if (!staticData.UseMBR()) {
+ bestHypo = manager.GetBestHypothesis();
+ if (bestHypo) {
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << bestHypo->GetTotalScore() << ' ';
+ }
+ if (staticData.IsPathRecoveryEnabled()) {
+ m_ioWrapper.OutputInput(out, bestHypo);
+ out << "||| ";
+ }
+ if (staticData.GetParam("print-id").size() && Scan<bool>(staticData.GetParam("print-id")[0]) ) {
+ out << m_source->GetTranslationId() << " ";
+ }
+
+ if (staticData.GetReportSegmentation() == 2) {
+ manager.GetOutputLanguageModelOrder(out, bestHypo);
+ }
+ m_ioWrapper.OutputBestSurface(
+ out,
+ bestHypo,
+ staticData.GetOutputFactorOrder(),
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors());
+ if (staticData.PrintAlignmentInfo()) {
+ out << "||| ";
+ m_ioWrapper.OutputAlignment(out, bestHypo);
+ }
+
+ m_ioWrapper.OutputAlignment(m_ioWrapper.GetAlignmentInfoCollector(), m_source->GetTranslationId(), bestHypo);
+ IFVERBOSE(1) {
+ debug << "BEST TRANSLATION: " << *bestHypo << endl;
+ }
+ } else {
+ VERBOSE(1, "NO BEST TRANSLATION" << endl);
+ }
+
+ out << endl;
+ }
+
+ // MBR decoding (n-best MBR, lattice MBR, consensus)
+ else {
+ // we first need the n-best translations
+ size_t nBestSize = staticData.GetMBRSize();
+ if (nBestSize == 0) {
+ cerr << "ERROR: mbr-size must be a positive number of MBR candidate translations" << endl;
+ exit(1);
+ }
+ TrellisPathList nBestList;
+ manager.CalcNBest(nBestSize, nBestList,true);
+ VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
+ IFVERBOSE(2) {
+ PrintUserTime("calculated n-best list for (L)MBR decoding");
+ }
+
+ // lattice MBR
+ if (staticData.UseLatticeMBR()) {
+ if (m_ioWrapper.GetNBestOutputCollector()) {
+ //lattice mbr nbest
+ vector<LatticeMBRSolution> solutions;
+ size_t n = min(nBestSize, staticData.GetNBestSize());
+ getLatticeMBRNBest(manager,nBestList,solutions,n);
+ ostringstream out;
+ m_ioWrapper.OutputLatticeMBRNBest(out, solutions,m_source->GetTranslationId());
+ m_ioWrapper.GetNBestOutputCollector()->Write(m_source->GetTranslationId(), out.str());
+ } else {
+ //Lattice MBR decoding
+ vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
+ m_ioWrapper.OutputBestHypo(mbrBestHypo, m_source->GetTranslationId(), staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),out);
+ IFVERBOSE(2) {
+ PrintUserTime("finished Lattice MBR decoding");
+ }
+ }
+ }
+
+ // consensus decoding
+ else if (staticData.UseConsensusDecoding()) {
+ const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
+ m_ioWrapper.OutputBestHypo(conBestHypo, m_source->GetTranslationId(),
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),out);
+ m_ioWrapper.OutputAlignment(m_ioWrapper.GetAlignmentInfoCollector(), m_source->GetTranslationId(), conBestHypo);
+ IFVERBOSE(2) {
+ PrintUserTime("finished Consensus decoding");
+ }
+ }
+
+ // n-best MBR decoding
+ else {
+ const TrellisPath &mbrBestHypo = doMBR(nBestList);
+ m_ioWrapper.OutputBestHypo(mbrBestHypo, m_source->GetTranslationId(),
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),out);
+ m_ioWrapper.OutputAlignment(m_ioWrapper.GetAlignmentInfoCollector(), m_source->GetTranslationId(), mbrBestHypo);
+ IFVERBOSE(2) {
+ PrintUserTime("finished MBR decoding");
+ }
+ }
+ }
+
+ // report best translation to output collector
+ m_ioWrapper.GetSingleBestOutputCollector()->Write(m_source->GetTranslationId(),out.str(),debug.str());
+
+ decisionRuleTime.stop();
+ VERBOSE(1, "Line " << m_source->GetTranslationId() << ": Decision rule took " << decisionRuleTime << " seconds total" << endl);
+ }
+
+ additionalReportingTime.start();
+
+ // output n-best list
+ if (m_ioWrapper.GetNBestOutputCollector() && !staticData.UseLatticeMBR()) {
+ TrellisPathList nBestList;
+ ostringstream out;
+ manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
+ m_ioWrapper.OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_source->GetTranslationId(),
+ staticData.GetReportSegmentation());
+ m_ioWrapper.GetNBestOutputCollector()->Write(m_source->GetTranslationId(), out.str());
+ }
+
+ //lattice samples
+ if (m_ioWrapper.GetLatticeSamplesCollector()) {
+ TrellisPathList latticeSamples;
+ ostringstream out;
+ manager.CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples);
+ m_ioWrapper.OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), m_source->GetTranslationId(),
+ staticData.GetReportSegmentation());
+ m_ioWrapper.GetLatticeSamplesCollector()->Write(m_source->GetTranslationId(), out.str());
+ }
+
+ // detailed translation reporting
+ if (m_ioWrapper.GetDetailedTranslationCollector()) {
+ ostringstream out;
+ fix(out,PRECISION);
+ TranslationAnalysis::PrintTranslationAnalysis(out, manager.GetBestHypothesis());
+ m_ioWrapper.GetDetailedTranslationCollector()->Write(m_source->GetTranslationId(),out.str());
+ }
+
+ //list of unknown words
+ if (m_ioWrapper.GetUnknownsCollector()) {
+ const vector<const Phrase*>& unknowns = manager.getSntTranslationOptions()->GetUnknownSources();
+ ostringstream out;
+ for (size_t i = 0; i < unknowns.size(); ++i) {
+ out << *(unknowns[i]);
+ }
+ out << endl;
+ m_ioWrapper.GetUnknownsCollector()->Write(m_source->GetTranslationId(), out.str());
+ }
+
+ // report additional statistics
+ manager.CalcDecoderStatistics();
+ VERBOSE(1, "Line " << m_source->GetTranslationId() << ": Additional reporting took " << additionalReportingTime << " seconds total" << endl);
+ VERBOSE(1, "Line " << m_source->GetTranslationId() << ": Translation took " << translationTime << " seconds total" << endl);
+ IFVERBOSE(2) {
+ PrintUserTime("Sentence Decoding Time:");
+ }
+}
+
+
+void TranslationTask::RunChart()
+{
+ const StaticData &staticData = StaticData::Instance();
+ const size_t translationId = m_source->GetTranslationId();
+
+ VERBOSE(2,"\nTRANSLATING(" << translationId << "): " << *m_source);
+
+ if (staticData.UseS2TDecoder()) {
+ S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
+ if (algorithm == RecursiveCYKPlus) {
+ typedef Syntax::S2T::EagerParserCallback Callback;
+ typedef Syntax::S2T::RecursiveCYKPlusParser<Callback> Parser;
+ DecodeS2T<Parser>();
+ } else if (algorithm == Scope3) {
+ typedef Syntax::S2T::StandardParserCallback Callback;
+ typedef Syntax::S2T::Scope3Parser<Callback> Parser;
+ DecodeS2T<Parser>();
+ } else {
+ UTIL_THROW2("ERROR: unhandled S2T parsing algorithm");
+ }
+ return;
+ }
+
+ if (staticData.GetSearchAlgorithm() == ChartIncremental) {
+ Incremental::Manager manager(*m_source);
+ const std::vector<search::Applied> &nbest = manager.ProcessSentence();
+ if (!nbest.empty()) {
+ m_ioWrapper.OutputBestHypo(nbest[0], translationId);
+ if (staticData.IsDetailedTranslationReportingEnabled()) {
+ const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
+ m_ioWrapper.OutputDetailedTranslationReport(&nbest[0], sentence, translationId);
+ }
+ if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
+ const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
+ m_ioWrapper.OutputDetailedTreeFragmentsTranslationReport(&nbest[0], sentence, translationId);
+ }
+ } else {
+ m_ioWrapper.OutputBestNone(translationId);
+ }
+ if (staticData.GetNBestSize() > 0)
+ m_ioWrapper.OutputNBestList(nbest, translationId);
+ return;
+ }
+
+ ChartManager manager(*m_source);
+ manager.ProcessSentence();
+
+ UTIL_THROW_IF2(staticData.UseMBR(), "Cannot use MBR");
+
+ // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
+ if (m_hypergraphOutputChart.get()) {
+ m_hypergraphOutputChart->Write(manager);
+ }
+
+
+ // 1-best
+ const ChartHypothesis *bestHypo = manager.GetBestHypothesis();
+ m_ioWrapper.OutputBestHypo(bestHypo, translationId);
+ IFVERBOSE(2) {
+ PrintUserTime("Best Hypothesis Generation Time:");
+ }
+
+ if (!staticData.GetAlignmentOutputFile().empty()) {
+ m_ioWrapper.OutputAlignment(translationId, bestHypo);
+ }
+
+ if (staticData.IsDetailedTranslationReportingEnabled()) {
+ const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
+ m_ioWrapper.OutputDetailedTranslationReport(bestHypo, sentence, translationId);
+ }
+ if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
+ const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
+ m_ioWrapper.OutputDetailedTreeFragmentsTranslationReport(bestHypo, sentence, translationId);
+ }
+ if (!staticData.GetOutputUnknownsFile().empty()) {
+ m_ioWrapper.OutputUnknowns(manager.GetParser().GetUnknownSources(),
+ translationId);
+ }
+
+ //DIMw
+ if (staticData.IsDetailedAllTranslationReportingEnabled()) {
+ const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
+ size_t nBestSize = staticData.GetNBestSize();
+ std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
+ manager.CalcNBest(nBestSize, nBestList, staticData.GetDistinctNBest());
+ m_ioWrapper.OutputDetailedAllTranslationReport(nBestList, manager, sentence, translationId);
+ }
+
+ // n-best
+ size_t nBestSize = staticData.GetNBestSize();
+ if (nBestSize > 0) {
+ VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl);
+ std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
+ manager.CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest());
+ m_ioWrapper.OutputNBestList(nBestList, translationId);
+ IFVERBOSE(2) {
+ PrintUserTime("N-Best Hypotheses Generation Time:");
+ }
+ }
+
+ if (staticData.GetOutputSearchGraph()) {
+ std::ostringstream out;
+ manager.OutputSearchGraphMoses( out);
+ OutputCollector *oc = m_ioWrapper.GetSearchGraphOutputCollector();
+ UTIL_THROW_IF2(oc == NULL, "File for search graph output not specified");
+ oc->Write(translationId, out.str());
+ }
+
+ IFVERBOSE(2) {
+ PrintUserTime("Sentence Decoding Time:");
+ }
+ manager.CalcDecoderStatistics();
+}
+
+}
diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h
new file mode 100644
index 000000000..3b5fe26d1
--- /dev/null
+++ b/moses/TranslationTask.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <boost/smart_ptr/shared_ptr.hpp>
+#include "moses/ThreadPool.h"
+#include "moses/Manager.h"
+#include "moses/HypergraphOutput.h"
+#include "moses/IOWrapper.h"
+#include "moses/Manager.h"
+#include "moses/ChartManager.h"
+
+#include "moses/Syntax/S2T/Manager.h"
+
+namespace Moses
+{
+class InputType;
+class OutputCollector;
+
+
+/** Translates a sentence.
+ * - calls the search (Manager)
+ * - applies the decision rule
+ * - outputs best translation and additional reporting
+ **/
+class TranslationTask : public Moses::Task
+{
+
+public:
+
+ TranslationTask(Moses::InputType* source, Moses::IOWrapper &ioWrapper,
+ bool outputSearchGraphSLF,
+ boost::shared_ptr<Moses::HypergraphOutput<Moses::Manager> > hypergraphOutput);
+
+ TranslationTask(Moses::InputType *source, IOWrapper &ioWrapper,
+ boost::shared_ptr<Moses::HypergraphOutput<Moses::ChartManager> > hypergraphOutputChart);
+
+ ~TranslationTask();
+
+ /** Translate one sentence
+ * (invoked through the Task interface, e.g. by the ThreadPool) */
+ void Run();
+
+
+private:
+ int m_pbOrChart; // 1 = phrase-based, 2 = chart
+ Moses::InputType* m_source;
+ Moses::IOWrapper &m_ioWrapper;
+
+ bool m_outputSearchGraphSLF;
+ boost::shared_ptr<Moses::HypergraphOutput<Moses::Manager> > m_hypergraphOutput;
+ boost::shared_ptr<Moses::HypergraphOutput<Moses::ChartManager> > m_hypergraphOutputChart;
+
+ void RunPb();
+ void RunChart();
+
+
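+ // Shared S2T decoding pipeline, parameterized by the parser type: run the
+ // search, then emit the 1-best hypothesis, n-best list, derivation report,
+ // and unknown words according to the configured options.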
+ template<typename Parser>
+ void DecodeS2T() {
+ const StaticData &staticData = StaticData::Instance();
+ const std::size_t translationId = m_source->GetTranslationId();
+ Syntax::S2T::Manager<Parser> manager(*m_source);
+ manager.Decode();
+ // 1-best
+ const Syntax::SHyperedge *best = manager.GetBestSHyperedge();
+ m_ioWrapper.OutputBestHypo(best, translationId);
+ // n-best
+ if (staticData.GetNBestSize() > 0) {
+ Syntax::KBestExtractor::KBestVec nBestList;
+ manager.ExtractKBest(staticData.GetNBestSize(), nBestList,
+ staticData.GetDistinctNBest());
+ m_ioWrapper.OutputNBestList(nBestList, translationId);
+ }
+ // Write 1-best derivation (-translation-details / -T option).
+ if (staticData.IsDetailedTranslationReportingEnabled()) {
+ m_ioWrapper.OutputDetailedTranslationReport(best, translationId);
+ }
+ // Write unknown words file (-output-unknowns option)
+ if (!staticData.GetOutputUnknownsFile().empty()) {
+ m_ioWrapper.OutputUnknowns(manager.GetUnknownWords(), translationId);
+ }
+ }
+
+};
+
+
+} //namespace
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index cdcb99dbc..d7cf3b367 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -74,6 +74,9 @@ const float DEFAULT_EARLY_DISCARDING_THRESHOLD = 0.0f;
const float DEFAULT_TRANSLATION_OPTION_THRESHOLD = 0.0f;
const size_t DEFAULT_VERBOSE_LEVEL = 1;
+// default fixed-point precision for float output (digits after the decimal point)
+static const size_t PRECISION = 3;
+
// enums.
// must be 0, 1, 2, ..., unless otherwise stated
@@ -160,6 +163,11 @@ enum FormatType {
,HieroFormat
};
+enum S2TParsingAlgorithm {
+ RecursiveCYKPlus,
+ Scope3
+};
+
// typedef
typedef size_t FactorType;
diff --git a/moses/Util.cpp b/moses/Util.cpp
index f92c32dbb..9664c811e 100644
--- a/moses/Util.cpp
+++ b/moses/Util.cpp
@@ -37,6 +37,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Timer.h"
#include "util/exception.hh"
#include "util/file.hh"
+#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/StaticData.h"
using namespace std;
@@ -203,6 +206,44 @@ std::string PassthroughSGML(std::string &line, const std::string tagName, const
return meta;
}
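+// Print one feature function's name followed by its weight vector, one value
+// per score component, e.g. "LM0= 0.5" (the feature name is illustrative).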
+void PrintFeatureWeight(const FeatureFunction* ff)
+{
+ cout << ff->GetScoreProducerDescription() << "=";
+ size_t numScoreComps = ff->GetNumScoreComponents();
+ vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+ for (size_t i = 0; i < numScoreComps; ++i) {
+ cout << " " << values[i];
+ }
+ cout << endl;
+
}
+void ShowWeights()
+{
+ fix(cout,6);
+ const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+
+ for (size_t i = 0; i < sff.size(); ++i) {
+ const StatefulFeatureFunction *ff = sff[i];
+ if (ff->IsTuneable()) {
+ PrintFeatureWeight(ff);
+ }
+ else {
+ cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
+ }
+ }
+ for (size_t i = 0; i < slf.size(); ++i) {
+ const StatelessFeatureFunction *ff = slf[i];
+ if (ff->IsTuneable()) {
+ PrintFeatureWeight(ff);
+ }
+ else {
+ cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
+ }
+ }
+}
+
+} // namespace
+
diff --git a/moses/Util.h b/moses/Util.h
index 24a4e2c28..acaa4b53c 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -431,7 +431,19 @@ T log_sum (T log_a, T log_b)
return ( v );
}
-
+/** Set fixed-point float output with 'size' digits after the decimal point */
+inline void fix(std::ostream& stream, size_t size)
+{
+ stream.setf(std::ios::fixed);
+ stream.precision(size);
}
+class FeatureFunction;
+
+void PrintFeatureWeight(const FeatureFunction* ff);
+void ShowWeights();
+
+
+} // namespace
+
#endif
diff --git a/moses/XmlOption.cpp b/moses/XmlOption.cpp
index 52779eaf9..2f66d647e 100644
--- a/moses/XmlOption.cpp
+++ b/moses/XmlOption.cpp
@@ -24,6 +24,8 @@
#include <vector>
#include <string>
#include <iostream>
+#include <boost/foreach.hpp>
+#include <boost/unordered_map.hpp>
#include "Util.h"
#include "StaticData.h"
#include "WordsRange.h"
@@ -348,6 +350,52 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
#endif
}
+ // weight-overwrite: update feature weights, unspecified weights remain unchanged
+ // IMPORTANT: translation models that cache phrases or apply table-limit during load
+ // based on initial weights need to be reset. Sending an empty update will do this
+ // for PhraseDictionaryBitextSampling (Mmsapt) models:
+ // <update name="TranslationModelName" source=" " target=" " alignment=" " />
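+ //
+ // Expected attribute format (the feature names here are illustrative):
+ // <weight-overwrite weights="LM0= 0.5 TM0= 0.2 0.3 0.1 0.4" />
+ // A token ending in '=' names a feature; the numeric tokens that follow it
+ // are that feature's new weights.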
+ else if (tagName == "weight-overwrite") {
+
+ // is a name->ff map stored anywhere so we don't have to build it every time?
+ const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
+ boost::unordered_map<string, FeatureFunction*> map;
+ BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
+ map[ff->GetScoreProducerDescription()] = ff;
+ }
+
+ // update each weight listed
+ ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights();
+ boost::unordered_map<string, FeatureFunction*>::iterator ffi;
+ string ffName("");
+ vector<float> ffWeights;
+ vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights"));
+ BOOST_FOREACH(string const& tok, toks) {
+ if (tok.substr(tok.size() - 1, 1) == "=") {
+ // start new feature
+ if (ffName != "") {
+ // set previous feature weights
+ if (ffi != map.end()) {
+ allWeights.Assign(ffi->second, ffWeights);
+ }
+ ffWeights.clear();
+ }
+ ffName = tok.substr(0, tok.size() - 1);
+ ffi = map.find(ffName);
+ if (ffi == map.end()) {
+ TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl);
+ }
+ } else {
+ // weight for current feature
+ ffWeights.push_back(Scan<float>(tok));
+ }
+ }
+ if (ffi != map.end()) {
+ allWeights.Assign(ffi->second, ffWeights);
+ }
+ StaticData::InstanceNonConst().SetAllWeights(allWeights);
+ }
+
// default: opening tag that specifies translation options
else {
if (startPos > endPos) {
diff --git a/moses-cmd/mbr.cpp b/moses/mbr.cpp
index 6a8dfa823..6a8dfa823 100644
--- a/moses-cmd/mbr.cpp
+++ b/moses/mbr.cpp
diff --git a/moses-cmd/mbr.h b/moses/mbr.h
index d08b11a98..d08b11a98 100644
--- a/moses-cmd/mbr.h
+++ b/moses/mbr.h
diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp
index 534ab177b..7af06405c 100644
--- a/phrase-extract/ScoreFeatureTest.cpp
+++ b/phrase-extract/ScoreFeatureTest.cpp
@@ -52,14 +52,18 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except)
{
//Check that configure rejects illegal domain arg combinations
ScoreFeatureManager manager;
- vector<string> args = boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null");
- BOOST_CHECK_THROW(manager.configure(args), ScoreFeatureArgumentException);
- args = boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null");
- BOOST_CHECK_THROW(manager.configure(args), ScoreFeatureArgumentException);
- args = boost::assign::list_of("--SparseDomainBlah")("/dev/null");
- BOOST_CHECK_THROW(manager.configure(args), ScoreFeatureArgumentException);
- args = boost::assign::list_of("--DomainSubset");
- BOOST_CHECK_THROW(manager.configure(args), ScoreFeatureArgumentException);
+ BOOST_CHECK_THROW(
+ manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")),
+ ScoreFeatureArgumentException);
+ BOOST_CHECK_THROW(
+ manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")),
+ ScoreFeatureArgumentException);
+ BOOST_CHECK_THROW(
+ manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")),
+ ScoreFeatureArgumentException);
+ BOOST_CHECK_THROW(
+ manager.configure(boost::assign::list_of("--DomainSubset")),
+ ScoreFeatureArgumentException);
}
template <class Expected>
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index 592ff7518..10697a956 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -354,7 +354,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " |||";
if (itemDirect.size() >= 6) {
//if (sourceLabelsFlag) {
- fileConsolidated << " " << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
+ fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
//} else {
// fileConsolidated << itemDirect[5];
//}
diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp
index 44cf2006c..7e084e495 100644
--- a/phrase-extract/extract-ghkm/Alignment.cpp
+++ b/phrase-extract/extract-ghkm/Alignment.cpp
@@ -21,6 +21,7 @@
#include "Exception.h"
+#include <algorithm>
#include <cassert>
#include <cstdlib>
@@ -61,5 +62,12 @@ void ReadAlignment(const std::string &s, Alignment &a)
}
}
+void FlipAlignment(Alignment &a)
+{
+ for (Alignment::iterator p = a.begin(); p != a.end(); ++p) {
+ std::swap(p->first, p->second);
+ }
+}
+
} // namespace GHKM
} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h
index 8dbaf483f..5aa24a712 100644
--- a/phrase-extract/extract-ghkm/Alignment.h
+++ b/phrase-extract/extract-ghkm/Alignment.h
@@ -34,6 +34,8 @@ typedef std::vector<std::pair<int, int> > Alignment;
void ReadAlignment(const std::string &, Alignment &);
+void FlipAlignment(Alignment &);
+
} // namespace GHKM
} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index e2d4fa436..3747f7a16 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -60,8 +60,18 @@ int ExtractGHKM::Main(int argc, char *argv[])
ProcessOptions(argc, argv, options);
// Open input files.
- InputFileStream targetStream(options.targetFile);
- InputFileStream sourceStream(options.sourceFile);
+ //
+ // The GHKM algorithm is neutral about whether the model is string-to-tree or
+ // tree-to-string. This implementation assumes the model to be
+ // string-to-tree, but if the -t2s option is given then the source and target
+ // input files are switched prior to extraction and then the source and
+ // target of the extracted rules are switched on output.
+ std::string effectiveTargetFile = options.t2s ? options.sourceFile
+ : options.targetFile;
+ std::string effectiveSourceFile = options.t2s ? options.targetFile
+ : options.sourceFile;
+ InputFileStream targetStream(effectiveTargetFile);
+ InputFileStream sourceStream(effectiveSourceFile);
InputFileStream alignmentStream(options.alignmentFile);
// Open output files.
@@ -215,6 +225,9 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::cerr << "skipping line " << lineNum << " without alignment points\n";
continue;
}
+ if (options.t2s) {
+ FlipAlignment(alignment);
+ }
// Record word counts.
if (!options.targetUnknownWordFile.empty()) {
@@ -436,6 +449,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
"include score based on PCFG scores in target corpus")
("PhraseOrientation",
"output phrase orientation information")
+ ("T2S",
+ "enable tree-to-string rule extraction (string-to-tree is assumed by default)")
("TreeFragments",
"output parse tree information")
("SourceLabels",
@@ -543,6 +558,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("PhraseOrientation")) {
options.phraseOrientation = true;
}
+ if (vm.count("T2S")) {
+ options.t2s = true;
+ }
if (vm.count("TreeFragments")) {
options.treeFragments = true;
}
diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h
index 0102e2f64..483479041 100644
--- a/phrase-extract/extract-ghkm/Options.h
+++ b/phrase-extract/extract-ghkm/Options.h
@@ -41,12 +41,13 @@ public:
, minimal(false)
, pcfg(false)
, phraseOrientation(false)
- , treeFragments(false)
- , sourceLabels(false)
, sentenceOffset(0)
- , unpairedExtractFormat(false)
+ , sourceLabels(false)
+ , t2s(false)
+ , treeFragments(false)
, unknownWordMinRelFreq(0.03f)
- , unknownWordUniform(false) {}
+ , unknownWordUniform(false)
+ , unpairedExtractFormat(false) {}
// Positional options
std::string targetFile;
@@ -66,16 +67,17 @@ public:
bool minimal;
bool pcfg;
bool phraseOrientation;
- bool treeFragments;
+ int sentenceOffset;
bool sourceLabels;
std::string sourceLabelSetFile;
- int sentenceOffset;
- bool unpairedExtractFormat;
- std::string targetUnknownWordFile;
std::string sourceUnknownWordFile;
- std::string unknownWordSoftMatchesFile;
+ bool t2s;
+ std::string targetUnknownWordFile;
+ bool treeFragments;
float unknownWordMinRelFreq;
+ std::string unknownWordSoftMatchesFile;
bool unknownWordUniform;
+ bool unpairedExtractFormat;
};
} // namespace GHKM
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp
index 4e0403d70..aa843c3c1 100644
--- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp
+++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp
@@ -38,7 +38,6 @@ PhraseOrientation::PhraseOrientation(int sourceSize,
const Alignment &alignment)
: m_countF(sourceSize)
, m_countE(targetSize)
- , m_alignment(alignment)
{
// prepare data structures for alignments
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h
index 0a94ad4d8..8ef05987f 100644
--- a/phrase-extract/extract-ghkm/PhraseOrientation.h
+++ b/phrase-extract/extract-ghkm/PhraseOrientation.h
@@ -82,7 +82,6 @@ private:
const int m_countF;
const int m_countE;
- const Alignment &m_alignment;
std::vector<std::vector<int> > m_alignedToT;
diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h
index 5f1f35a61..2a57fdded 100644
--- a/phrase-extract/extract-ghkm/ScfgRule.h
+++ b/phrase-extract/extract-ghkm/ScfgRule.h
@@ -40,7 +40,7 @@ class Subgraph;
enum SymbolType { Terminal, NonTerminal };
-struct Symbol {
+class Symbol {
public:
Symbol(const std::string &v, SymbolType t) : m_value(v) , m_type(t) {}
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index 2fba6930b..d306b845f 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -47,14 +47,26 @@ void ScfgRuleWriter::Write(const ScfgRule &rule, bool printEndl)
}
// Write the rule to the forward and inverse extract files.
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
- m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+ if (m_options.t2s) {
+ // If model is tree-to-string then flip the source and target.
+ m_fwd << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+ m_inv << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+ } else {
+ m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+ m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+ }
const Alignment &alignment = rule.GetAlignment();
for (Alignment::const_iterator p = alignment.begin();
p != alignment.end(); ++p) {
- m_fwd << " " << p->first << "-" << p->second;
- m_inv << " " << p->second << "-" << p->first;
+ if (m_options.t2s) {
+ // If model is tree-to-string then flip the source and target.
+ m_fwd << " " << p->second << "-" << p->first;
+ m_inv << " " << p->first << "-" << p->second;
+ } else {
+ m_fwd << " " << p->first << "-" << p->second;
+ m_inv << " " << p->second << "-" << p->first;
+ }
}
// Write a count of 1.
@@ -66,6 +78,8 @@ void ScfgRuleWriter::Write(const ScfgRule &rule, bool printEndl)
m_fwd << " ||| " << std::exp(rule.GetPcfgScore());
}
+ m_fwd << " |||";
+
if (m_options.sourceLabels && rule.HasSourceLabels()) {
m_fwd << " {{SourceLabels";
rule.PrintSourceLabels(m_fwd);
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index 8a8564580..240492824 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -32,7 +32,7 @@ namespace GHKM
struct Options;
class ScfgRule;
-struct Symbol;
+class Symbol;
class ScfgRuleWriter
{
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index 676c145e2..50baa4e0d 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -1119,8 +1119,8 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
ofstream grammarFile;
grammarFile.open(fileName.c_str());
if (!options.targetSyntax) {
- grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
- << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
+ grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0" << endl
+ << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0" << endl
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
} else {
 // choose a top label that is not already a label
@@ -1132,13 +1132,13 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
}
}
// basic rules
- grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << endl
- << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << endl;
+ grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| 0-0" << endl
+ << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1" << endl;
// top rules
for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
i != targetTopLabelCollection.end(); i++ ) {
- grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << endl;
+ grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2" << endl;
}
// glue rules
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 7f155f6ed..239492613 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -475,7 +475,7 @@ void processLine( std::string line,
std::string &additionalPropertiesString,
float &count, float &pcfgSum )
{
- size_t foundAdditionalProperties = line.find("{{");
+ size_t foundAdditionalProperties = line.find("||| {{");
if (foundAdditionalProperties != std::string::npos) {
additionalPropertiesString = line.substr(foundAdditionalProperties);
line = line.substr(0,foundAdditionalProperties);
diff --git a/regression-testing/Jamfile b/regression-testing/Jamfile
index 78349ea2c..b2ba7cce1 100644
--- a/regression-testing/Jamfile
+++ b/regression-testing/Jamfile
@@ -25,7 +25,7 @@ if $(with-regtest) {
$(TOP)/regression-testing/run-single-test.perl --decoder=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
}
reg_test phrase : [ glob $(test-dir)/phrase.* ] : ../moses-cmd//moses : @reg_test_decode ;
- reg_test chart : [ glob $(test-dir)/chart.* ] : ../moses-chart-cmd//moses_chart : @reg_test_decode ;
+ reg_test chart : [ glob $(test-dir)/chart.* ] : ../moses-cmd//moses : @reg_test_decode ;
actions reg_test_score {
$(TOP)/regression-testing/run-test-scorer.perl --scorer=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 27a5e843e..4dbe6751f 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -116,14 +116,14 @@ consolidate
in: CORPUS:clean-parsed-stem
out: tokenized-stem
default-name: truecaser/corpus
- pass-unless: trainer
+ pass-unless: trainer
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
error: number of lines don't match
train
in: tokenized-stem
out: truecase-model
rerun-on-change: trainer
- pass-unless: trainer
+ pass-unless: trainer
default-name: truecaser/truecase-model
template: $trainer -model OUT.$input-extension -corpus IN.$input-extension ; $trainer -model OUT.$output-extension -corpus IN.$output-extension
@@ -643,7 +643,7 @@ build-sparse
create-config
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
out: config
- ignore-if: use-hiero
+ ignore-if: use-hiero thot
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt
default-name: model/moses.ini
error: Unknown option
@@ -700,6 +700,18 @@ hiero-create-config
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors
default-name: hiero-model/hiero.ini
template: $hiero-util-dir/generate-ini.pl IN IN1 IN2 IN3 IN4 IN5 $hiero-max-phrase-length $hiero-max-nonterminals $hiero-max-phrase-span $hiero-min-gap-length $hiero-freq-rank1 $hiero-freq-rank2 < $GENERAL:hiero-template-ini > OUT
+thot-build-ttable
+ in: corpus
+ out: thot-ttable
+ default-name: model/phrase-table-thot
+ rerun-on-change: input-extension output-extension
+ template: $thot/thot_tm_train -sdir $working-dir -s IN.$input-extension -t IN.$output-extension -o OUT
+thot-create-config
+ in: thot-ttable LM:lm
+ out: config
+ ignore-unless: thot
+ default-name: model/thot.ini
+ template: $thot/thot_gen_server_cfg_file IN1/lm_desc IN/tm_desc > OUT
[TUNING] single
input-from-sgm
@@ -968,10 +980,17 @@ tune
final-model: yes
rerun-on-change: decoder-settings tuning-settings nbest lambda async
not-error: trans: No such file or directory
+thot-tune
+ in: TRAINING:config input reference
+ out: config-with-reused-weights
+ ignore-unless: thot
+ tmp-name: tuning/thot.tmp
+ default-name: tuning/thot.tuned.ini
+ template: mkdir -p TMP/home ; mkdir -p TMP/tdir ; mkdir -p TMP/sdir ; HOME=TMP/home $thot/thot_smt_tune -tdir TMP/tdir -sdir TMP/sdir -c IN -s IN1 -t IN2 -o OUT
apply-weights
in: TRAINING:bin-config weight-config
out: config-with-reused-weights
- ignore-if: use-hiero
+ ignore-if: use-hiero thot
default-name: tuning/moses.tuned.ini
template: $moses-script-dir/ems/support/substitute-weights.perl IN IN1 OUT
error: cannot open
@@ -1078,14 +1097,14 @@ apply-filter
in: filtered-dir TRAINING:config TUNING:config-with-reused-weights
out: filtered-config
default-name: evaluation/filtered.ini
- ignore-if: TRAINING:binarize-all
+ ignore-if: TRAINING:binarize-all thot
template: $moses-script-dir/ems/support/substitute-filtered-tables-and-weights.perl IN/moses.ini IN1 IN2 OUT
decode
in: TUNING:config-with-reused-weights input filtered-config
out: system-output
default-name: evaluation/output
qsub-script: yes
- ignore-if: use-hiero
+ ignore-if: use-hiero thot
rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade TRAINING:post-decoding-transliteration
error: Translation was not performed correctly
not-error: trans: No such file or directory
@@ -1098,6 +1117,20 @@ hiero-decode
ignore-unless: use-hiero
template: $hiero-parallelizer -e OUT.edir -r -- $hiero-decoder -c IN < IN1 > OUT
rerun-on-change: hiero-decoder
+thot-filter
+ in: TUNING:config-with-reused-weights input
+ out: filtered-config
+ ignore-unless: thot
+ default-name: evaluation/filtered
+ tmp-name: evaluation/filtered-tmp
+ template: mkdir -p TMP/home ; mkdir -p TMP/tdir ; mkdir -p TMP/sdir ; HOME=TMP/home $thot/thot_prepare_sys_for_test -sdir TMP/sdir -tdir TMP/tdir -t IN1 -c IN/tuned_for_dev.cfg -o OUT ; cp OUT/lm/main/* OUT/lm
+thot-decode
+ in: input filtered-config
+ out: system-output
+ ignore-unless: thot
+ default-name: evaluation/output
+ template: $thot/thot_decoder -sdir $working-dir -c IN1/test_specific.cfg -t IN > OUT
+ not-error: Error in word penalty model file
remove-markup
in: system-output
out: cleaned-output
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 10f4258f0..1e47bb6b9 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -281,6 +281,7 @@ sub read_meta {
$escaped_template =~ s/^IN/EMS_IN_EMS/;
$escaped_template =~ s/ IN(\d*)/ EMS_IN$1_EMS/g;
$escaped_template =~ s/ OUT/ EMS_OUT_EMS/g;
+ $escaped_template =~ s/TMP/EMS_TMP_EMS/g;
$TEMPLATE{"$module:$step"} = $escaped_template;
}
elsif ($1 eq "template-if") {
@@ -288,6 +289,7 @@ sub read_meta {
$escaped_template =~ s/^IN/EMS_IN_EMS/;
$escaped_template =~ s/ IN(\d*)/ EMS_IN$1_EMS/g;
$escaped_template =~ s/ OUT/ EMS_OUT_EMS/g;
+ $escaped_template =~ s/TMP/EMS_TMP_EMS/g;
my @IF = split(/\s+/,$escaped_template);
push @{$TEMPLATE_IF{"$module:$step"}}, \@IF;
}
@@ -3295,6 +3297,7 @@ sub define_template {
# replace IN and OUT with %s
$single_cmd =~ s/EMS_IN_EMS\S*/\%s/;
$single_cmd =~ s/EMS_OUT_EMS\S*/\%s/;
+ $single_cmd =~ s/EMS_SLASH_OUT_EMS\S*/\%s/;
# build tmp
my $tmp_dir = $module;
$tmp_dir =~ tr/A-Z/a-z/;
@@ -3335,6 +3338,10 @@ sub define_template {
$cmd =~ s/EMS_IN_EMS/$INPUT[0]/g;
}
$cmd =~ s/EMS_OUT_EMS/$output/g;
+ if (defined($STEP_TMPNAME{"$module:$stepname"})) {
+ my $tmp = $dir."/".$STEP_TMPNAME{"$module:$stepname"}.".$VERSION";
+ $cmd =~ s/EMS_TMP_EMS/$tmp/g;
+ }
$cmd =~ s/VERSION/$VERSION/g;
print "\tcmd is $cmd\n" if $VERBOSE;
while ($cmd =~ /^([\S\s]*)\$\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index a2f9580a9..be5b76a5e 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -745,37 +745,15 @@ sub hierarchical_segmentation {
open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
open(NODE,">$dir/node") or die "Cannot open: $!";
while(<TRACE>) {
- /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
- /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_");
- my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
- if ($last_sentence >= 0 && $sentence != $last_sentence) {
- &hs_process($last_sentence,\@DERIVATION,\%STATS);
- @DERIVATION = ();
- }
- my %ITEM;
- $ITEM{'start'} = $start;
- $ITEM{'end'} = $end;
- $ITEM{'rule_lhs'} = $rule_lhs;
-
- $rule_rhs =~ s/</&lt;/g;
- $rule_rhs =~ s/>/&gt;/g;
- @{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs);
-
- foreach (split(/ /,$alignment)) {
- /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
- $ITEM{'alignment'}{$2} = $1; # target non-terminal to source span
- $ITEM{'alignedSpan'}{$1} = 1;
- }
-
- @{$ITEM{'spans'}} = ();
- foreach my $span (reverse split(/\s+/,$spans)) {
- $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
- my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
- push @{$ITEM{'spans'}}, \%SPAN;
- }
-
- push @DERIVATION,\%ITEM;
- $last_sentence = $sentence;
+ my $sentence;
+ my %ITEM;
+ &hs_scan_line($_, \$sentence, \%ITEM) || die("cannot scan line $_");
+ if ($last_sentence >= 0 && $sentence != $last_sentence) {
+ &hs_process($last_sentence,\@DERIVATION,\%STATS);
+ @DERIVATION = ();
+ }
+ push @DERIVATION,\%ITEM;
+ $last_sentence = $sentence;
}
&hs_process($last_sentence,\@DERIVATION,\%STATS);
close(TRACE);
@@ -793,6 +771,84 @@ sub hierarchical_segmentation {
close(SUMMARY);
}
+# scan a single line of the trace file
+sub hs_scan_line {
+ my ($line,$ref_sentence,$ref_item) = @_;
+
+ if ($line =~ /^Trans Opt/) {
+ # Old format
+ $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
+ $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ || return 0;
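+ # hypothetical trace line matching the first pattern:
+ # Trans Opt 0 [0..2]: [0..0]=das [1..2]=haus : S ->the house :0-0 1-1: pC=-1.5, c=-2.0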
+ my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
+
+ ${$ref_sentence} = $sentence;
+
+ $ref_item->{'start'} = $start;
+ $ref_item->{'end'} = $end;
+ $ref_item->{'rule_lhs'} = $rule_lhs;
+
+ $rule_rhs =~ s/</&lt;/g;
+ $rule_rhs =~ s/>/&gt;/g;
+ @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs);
+
+ foreach (split(/ /,$alignment)) {
+ /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
+ $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
+ $ref_item->{'alignedSpan'}{$1} = 1;
+ }
+
+ @{$ref_item->{'spans'}} = ();
+ foreach my $span (reverse split(/\s+/,$spans)) {
+ $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
+ my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
+ push @{$ref_item->{'spans'}}, \%SPAN;
+ }
+ } else {
+ # New format
+ $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0;
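+ # hypothetical trace line matching the pattern above:
+ # 0 ||| [X] -> das [X,1] ||| [S] -> the [X,1] ||| 0-0 1-1 ||| 0..0 1..2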
+ my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6);
+
+ ${$ref_sentence} = $sentence;
+
+ @{$ref_item->{'spans'}} = ();
+ foreach (split(/ /,$source_rhs)) {
+ /^\[?([^\]]+)\]?$/;
+ my %SPAN = ( 'word' => $1 );
+ push @{$ref_item->{'spans'}}, \%SPAN;
+ }
+
+ my $i = 0;
+ foreach my $span (split(/ /,$source_spans)) {
+ $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n");
+ $ref_item->{'spans'}[$i]{'from'} = $1;
+ $ref_item->{'spans'}[$i]{'to'} = $2;
+ if ($i == 0) {
+ $ref_item->{'start'} = $1;
+ }
+ $ref_item->{'end'} = $2;
+ $i++;
+ }
+
+ $ref_item->{'rule_lhs'} = $target_lhs;
+
+ $target_rhs =~ s/</&lt;/g;
+ $target_rhs =~ s/>/&gt;/g;
+ @{$ref_item->{'rule_rhs'}} = ();
+ foreach (split(/ /,$target_rhs)) {
+ /^\[?([^\]]+)\]?$/;
+ push @{$ref_item->{'rule_rhs'}}, $1;
+ }
+
+ foreach (split(/ /,$alignment)) {
+ /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
+ $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
+ $ref_item->{'alignedSpan'}{$1} = 1;
+ }
+ }
+
+ return 1;
+}
+
# process a single sentence for hierarchical segmentation
sub hs_process {
my ($sentence,$DERIVATION,$STATS) = @_;
diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl
index 9e4b35a77..722f02701 100755
--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@@ -19,7 +19,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
$factor = $1 if $feature_spec =~ / factor ([\d\-]+)/;
if ($SPEC[0] eq 'target-word-insertion') {
- $ini .= "TargetWordInsertionFeature factor=$factor";
+ $ini .= "TargetWordInsertionFeature name=TWI factor=$factor";
if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
my $file = &create_top_words($output_extension, $SPEC[2]);
@@ -33,7 +33,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
$ini .= "\n";
}
elsif ($SPEC[0] eq 'source-word-deletion') {
- $ini .= "SourceWordDeletionFeature factor=$factor";
+ $ini .= "SourceWordDeletionFeature name=SWD factor=$factor";
if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
my $file = &create_top_words($input_extension, $SPEC[2]);
$ini .= " path=$file";
@@ -59,10 +59,10 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
die("ERROR: Unknown parameter specification in '$SPEC[1]'\n");
}
my ($input_factor,$output_factor) = split(/\-/,$factor);
- $ini .= "WordTranslationFeature input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
+ $ini .= "WordTranslationFeature name=WT input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
}
elsif ($SPEC[0] eq 'phrase-length') {
- $ini .= "PhraseLengthFeature\n";
+ $ini .= "PhraseLengthFeature name=PL\n";
}
else {
die("ERROR: Unknown feature type '$SPEC[0]' in specification '$feature_spec'\nfull spec: '$specification'\n");
diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl
index 3efb243d7..be1509b8f 100755
--- a/scripts/ems/support/substitute-filtered-tables.perl
+++ b/scripts/ems/support/substitute-filtered-tables.perl
@@ -17,7 +17,7 @@ while(my $line = <FILTERED>) {
$feature_section = ($1 eq "feature");
}
next unless $feature_section;
- if ($line =~ /PhraseDictionary/) {
+ if ($line =~ /PhraseDictionary/ || $line =~ /RuleTable/) {
print STDERR "pt:$line \n";
push(@arr, $line);
}
@@ -36,7 +36,7 @@ while(my $line = <STDIN>) {
if ($line =~ /^\[(.+)\]/) {
$feature_section = ($1 eq "feature");
}
- if ($feature_section && $line =~ /PhraseDictionary/) {
+ if ($feature_section && ($line =~ /PhraseDictionary/ || $line =~ /RuleTable/)) {
print $arr[$ind]."\n";
++$ind;
}
diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl
new file mode 100755
index 000000000..e6f7839f1
--- /dev/null
+++ b/scripts/ems/support/thot-lm-wrapper.perl
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Getopt::Long "GetOptions";
+
+my ($TEXT,$ORDER,$BIN,$LM,$TMP);
+
+&GetOptions('text=s' => \$TEXT,
+ 'lm=s' => \$LM,
+ 'tmp=s' => \$TMP,
+ 'bin=s' => \$BIN,
+ 'order=i' => \$ORDER);
+
+die("ERROR: specify --text CORPUS --lm LM --order N --bin THOT_BINARY !")
+ unless defined($TEXT) && defined($LM) && defined($ORDER) && defined($BIN);
+
+my $cmd = "$BIN -c $TEXT -n $ORDER -o $LM -unk -sdir $TMP -tdir $TMP";
+
+print "exec: $cmd\n";
+`$cmd`;
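+
+# Example invocation (hypothetical paths and Thot binary name):
+# thot-lm-wrapper.perl --text corpus.en --order 5 --lm lm/corpus --bin $THOT/thot_lm_train --tmp /tmp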
diff --git a/scripts/generic/moses_sim_pe.py b/scripts/generic/moses_sim_pe.py
index f77e7de6e..f4d640cfa 100755
--- a/scripts/generic/moses_sim_pe.py
+++ b/scripts/generic/moses_sim_pe.py
@@ -109,6 +109,8 @@ def main(argv):
n_best_out = None
n_best_size = None
n_best_distinct = False
+ hg_ext = None
+ hg_dir = None
tmp_dir = '/tmp'
xml_found = False
xml_input = 'exclusive'
@@ -149,6 +151,15 @@ def main(argv):
cmd = cmd[:i] + cmd[i + 4:]
else:
cmd = cmd[:i] + cmd[i + 3:]
+ elif cmd[i] == '-output-search-graph-hypergraph':
+ # cmd[i + 1] == true
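+ # e.g. "-output-search-graph-hypergraph true gz [dir]"; dir defaults to "hypergraph"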
+ hg_ext = cmd[i + 2]
+ if i + 3 < len(cmd) and cmd[i + 3][0] != '-':
+ hg_dir = cmd[i + 3]
+ cmd = cmd[:i] + cmd[i + 4:]
+ else:
+ hg_dir = 'hypergraph'
+ cmd = cmd[:i] + cmd[i + 3:]
elif cmd[i] == '-tmp':
tmp_dir = cmd[i + 1]
cmd = cmd[:i] + cmd[i + 2:]
@@ -177,6 +188,7 @@ def main(argv):
# Dynamic means update this model
if v.startswith('Dynamic'):
mmsapt_dynamic.append(v)
+ moses_ini_lines[i] += '{mmsapt_extra}'
else:
mmsapt_static.append(v)
elif k == 'L1':
@@ -189,7 +201,6 @@ def main(argv):
sys.stderr.write('Error: All PhraseDictionaryBitextSampling entries should have same L2: {} != {}\n'.format(v, mmsapt_l2))
sys.exit(1)
mmsapt_l2 = v
- moses_ini_lines[i] += '{mmsapt_extra}'
# [threads]
# 8
elif moses_ini_lines[i] == '[threads]':
@@ -251,6 +262,8 @@ def main(argv):
sys.stderr.write('Batch size: {}\n'.format(batch_size))
if n_best_out:
sys.stderr.write('N-best list: {} ({}{})\n'.format(n_best_out, n_best_size, ', distinct' if n_best_distinct else ''))
+ if hg_dir:
+ sys.stderr.write('Hypergraph dir: {} ({})\n'.format(hg_dir, hg_ext))
sys.stderr.write('Temp dir: {}\n'.format(work_dir))
# Accumulate seen lines
@@ -315,6 +328,11 @@ def main(argv):
work_cmd.append(str(n_best_size))
if n_best_distinct:
work_cmd.append('distinct')
+ if hg_dir:
+ work_cmd.append('-output-search-graph-hypergraph')
+ work_cmd.append('true')
+ work_cmd.append(hg_ext)
+ work_cmd.append(os.path.join(work_dir, 'hg.{}'.format(i)))
in_file = os.path.join(work_dir, 'input.{}.xml'.format(i))
out_file = os.path.join(work_dir, 'out.{}'.format(i))
err_file = os.path.join(work_dir, 'err.{}'.format(i))
@@ -333,6 +351,15 @@ def main(argv):
for line in open(os.path.join(work_dir, 'nbest.{}'.format(i)), 'r'):
entry = line.partition(' ')
out.write('{} {}'.format(int(entry[0]) + (i * batch_size), entry[2]))
+
+ # Gather hypergraphs
+ if hg_dir:
+ if not os.path.exists(hg_dir):
+ os.mkdir(hg_dir)
+ shutil.copy(os.path.join(work_dir, 'hg.0', 'weights'), os.path.join(hg_dir, 'weights'))
+ for i in range(threads):
+ for j in range(batch_size):
+ shutil.copy(os.path.join(work_dir, 'hg.{}'.format(i), '{}.{}'.format(j, hg_ext)), os.path.join(hg_dir, '{}.{}'.format((i * batch_size) + j, hg_ext)))
# Gather stdout
for i in range(threads):
diff --git a/scripts/nplm-training/README b/scripts/nplm-training/README
new file mode 100644
index 000000000..bf0666243
--- /dev/null
+++ b/scripts/nplm-training/README
@@ -0,0 +1,9 @@
+Example usage:
+#create training and test corpus
+/home/abmayne/code/deepathon/nnjm/extract_training.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus corpus/europarl.clean.10k --target-language cs --source-language en --align corpus/europarl.clean.10k.align
+/home/abmayne/code/deepathon/nnjm/extract_test.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus corpus/europarl.test.10k --target-language cs --source-language en --align corpus/europarl.test.10k.align
+
+#Train and test different language models with basic nplm training
+/home/abmayne/code/deepathon/nnjm/train_nplm.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus europarl.clean.10k --minibatch-size 128 --epochs 40 --nplm-home /home/abmayne/code/deepathon/nplm_one_layer --hidden 0 --threads 1 --output-model europarl.10k.1layer
+/home/abmayne/code/deepathon/nnjm/test_nplm.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus europarl.test.10k --train-corpus europarl.10k.1layer --nplm-home /home/abmayne/code/deepathon/nplm_one_layer --threads 1
+
diff --git a/scripts/nplm-training/averageNullEmbedding_baseline.py b/scripts/nplm-training/averageNullEmbedding_baseline.py
new file mode 100755
index 000000000..8fe616b46
--- /dev/null
+++ b/scripts/nplm-training/averageNullEmbedding_baseline.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python2
+import sys
+import numpy
+import optparse
+#sys.path.append('/data/tools/nplm/python')
+
+parser = optparse.OptionParser("%prog [options]")
+parser.add_option("-p", "--nplm-python-path", type="string", dest="nplm_python_path")
+parser.add_option("-i", "--input-model", type="string", dest="input_model")
+parser.add_option("-o", "--output-model", type="string", dest="output_model")
+parser.add_option("-n", "--null-token-index", type="int", dest="null_idx")
+parser.add_option("-t", "--training-ngrams", type="string", dest="training_ngrams")
+
+
+parser.set_defaults(
+ nplm_python_path = '/mnt/gna0/rsennrich/tools/nplm/python',
+ null_idx = 1
+)
+options,_ = parser.parse_args(sys.argv)
+
+sys.path.append(options.nplm_python_path)
+import nplm
+from collections import defaultdict
+
+def load_model(model_file):
+ return nplm.NeuralLM.from_file(model_file)
+
+def get_weights(path, length):
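+ # count how often each vocabulary id occurs in the last context position
+ # of the training ngrams; these counts weight the embedding average below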
+ d = [0]*length
+ for line in open(path):
+ last_context = int(line.split()[-2])
+ d[last_context] += 1
+ return d
+
+if __name__ == "__main__":
+
+ a = load_model(options.input_model)
+ print 'before:'
+ print a.input_embeddings[options.null_idx]
+ weights = numpy.array(get_weights(options.training_ngrams, len(a.input_embeddings)))
+ a.input_embeddings[options.null_idx] = numpy.average(numpy.array(a.input_embeddings), weights=weights, axis=0)
+ print 'after:'
+ print a.input_embeddings[options.null_idx]
+ a.to_file(open(options.output_model,'w'))
diff --git a/scripts/nplm-training/extract.py b/scripts/nplm-training/extract.py
new file mode 100755
index 000000000..12a5860b2
--- /dev/null
+++ b/scripts/nplm-training/extract.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+from collections import Counter
+import heapq
+import logging
+import optparse
+import sys
+
+LOG = logging.getLogger(__name__)
+
+BOS = "<s>"
+EOS = "</s>"
+UNK = "<unk>"
+
+def replace_tags(tokens,tags,vocab):
+ for i,t in enumerate(tokens):
+ if not t in vocab:
+ if i < len(tags):
+ tokens[i] = tags[i]
+ else:
+ print "Error: missing tags for index i:", i
+ print ' '.join(tokens)
+ print ' '.join(tags)
+ tokens[i] = UNK
+
+def replace_unks(tokens,vocab):
+ for i,t in enumerate(tokens):
+ if not t in vocab:
+ tokens[i] = UNK
+
+
+def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang, m, n, ofh):
+ """
+ m - source context
+ n - target context
+
+ returns a Counter of the tags used
+ """
+ tags = Counter()
+ sfh = open(corpus_stem + "." + slang)
+ tfh = open(corpus_stem + "." + tlang)
+ afh = open(align_file)
+ fhs = [sfh,tfh,afh]
+ if tagged_stem:
+ fhs.append(open(tagged_stem + "." + slang))
+ fhs.append(open(tagged_stem + "." + tlang))
+
+ count = 0
+ ngrams = 0
+ LOG.info("Extracting ngrams")
+ for lines in zip(*fhs):
+ stokens = lines[0][:-1].split()
+ ttokens = lines[1][:-1].split()
+ stokens.append(EOS)
+ ttokens.append(EOS)
+ if tagged_stem:
+ stags = lines[3][:-1].split()
+ ttags = lines[4][:-1].split()
+ stags.append(EOS)
+ ttags.append(EOS)
+ tags.update(stags)
+ tags.update(ttags)
+ replace_tags(stokens,stags,svocab)
+ replace_tags(ttokens,ttags,tvocab)
+ else:
+ replace_unks(stokens,svocab)
+ replace_unks(ttokens,tvocab)
+ # list aligns for each target
+ # Note: align specifies source -> target
+ target_aligns = [[] for t in range(len(ttokens))]
+ for atoken in lines[2][:-1].split():
+ spos,tpos = atoken.split("-")
+ spos,tpos = int(spos), int(tpos)
+ target_aligns[tpos].append(spos)
+ #EOS alignment
+ target_aligns[-1] = [len(stokens)-1]
+
+ for tpos,spos_list in enumerate(target_aligns):
+ # Affiliation heuristics - see Devlin et al. p1371
+ if not spos_list:
+ #tpos has no alignment, look right, then left, then right-right, then left-left etc
+ rpos = tpos+1
+ lpos = tpos-1
+ while rpos < len(ttokens) or lpos >= 0:
+ if rpos < len(ttokens) and target_aligns[rpos]:
+ spos_list = target_aligns[rpos]
+ break
+ if lpos >= 0 and target_aligns[lpos]:
+ spos_list = target_aligns[lpos]
+ break
+ rpos += 1
+ lpos -= 1
+
+ if not spos_list:
+ raise Exception("No alignments in sentence \nSRC: " + lines[0][:-1] + "\nTGT: " + lines[1][:-1])
+ spos = (max(spos_list) + min(spos_list)) / 2
+
+
+ # source-context, target-context, predicted word
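+ # each output line is 2m+1 source tokens (window centred on spos, padded
+ # with BOS/EOS) followed by n target tokens ending in the predicted word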
+ for i in range(max(0,m-spos)):
+ print>>ofh, BOS,
+ #print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
+ print>>ofh, " ".join([s for s in stokens[max(0,spos-m):spos+m+1]]),
+ for i in range(max(0,spos+m+1-len(stokens))):
+ print>>ofh, EOS,
+ for i in range(max(0,n-(tpos+1))):
+ print>>ofh, BOS,
+ print>>ofh, " ".join([t for t in ttokens[max(0,tpos+1-n):tpos+1]]),
+ print>>ofh
+ ngrams += 1
+
+
+ count += 1
+ if count % 1000 == 0: sys.stderr.write(".")
+ if count % 50000 == 0: sys.stderr.write(" [%d]\n" % count)
+ ofh.close()
+ sys.stderr.write("\n")
+ LOG.info("Extracted %d ngrams" % ngrams)
+ return tags
+
+
diff --git a/scripts/nplm-training/extract_test.py b/scripts/nplm-training/extract_test.py
new file mode 100755
index 000000000..c8325e511
--- /dev/null
+++ b/scripts/nplm-training/extract_test.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+#
+# Create a test corpus, using a previously pruned vocabulary.
+#
+
+import logging
+import optparse
+import os
+import os.path
+import sys
+
+import extract
+
+LOG = logging.getLogger(__name__)
+
+def read_vocab(filename):
+ vocab = set()
+ for line in open(filename):
+ vocab.add(line[:-1])
+ return vocab
+
+def main():
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-e", "--target-language", type="string", dest="target_language")
+ parser.add_option("-f", "--source-language", type="string", dest="source_language")
+ parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
+ parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
+ parser.add_option("-a", "--align", type="string", dest="align_file")
+ parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
+
+
+ parser.set_defaults(
+ target_language = "en",
+ source_language = "de",
+ corpus_stem = "test",
+ align_file = "test.align",
+ working_dir = "working",
+ )
+ options,args = parser.parse_args(sys.argv)
+ if not os.path.exists(options.working_dir):
+ LOG.error("Working directory '%s' not found" % options.working_dir)
+ sys.exit(1)
+
+ m,n = None,None
+ for line in open(options.working_dir + "/info"):
+ name,value = line[:-1].split()
+ if name == "m": m = int(value)
+ if name == "n": n = int(value)
+ if m is None or n is None:
+ LOG.error("info file is incomplete")
+ sys.exit(1)
+
+ svocab = read_vocab(options.working_dir + "/vocab.source")
+ tvocab = read_vocab(options.working_dir + "/vocab.target")
+
+ file_stem = os.path.basename(options.corpus_stem)
+ ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
+ extract.get_ngrams(options.corpus_stem,
+ options.align_file,
+ options.tagged_stem,
+ svocab,
+ tvocab,
+ options.source_language,
+ options.target_language,
+ m,
+ n,
+ ofh)
+
+
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/nplm-training/extract_training.py b/scripts/nplm-training/extract_training.py
new file mode 100755
index 000000000..af272786c
--- /dev/null
+++ b/scripts/nplm-training/extract_training.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+
+from collections import Counter
+import logging
+import optparse
+import os
+import os.path
+import sys
+
+import extract
+
+LOG = logging.getLogger(__name__)
+
+def get_pruned_vocab(corpus,prune):
+ counts = Counter()
+ LOG.info("Reading vocabulary from %s" % corpus)
+ lines = 0
+ for line in open(corpus):
+ for token in line[:-1].split():
+ counts[token] += 1
+ lines += 1
+ if lines % 1000 == 0: sys.stderr.write(".")
+ if lines % 50000 == 0: sys.stderr.write(" [%d]\n" % lines)
+ sys.stderr.write("\n")
+ counts[extract.BOS] += lines
+ counts[extract.EOS] += lines
+ LOG.info("Vocabulary size: %d" % len(counts))
+ if prune:
+ return Counter(dict(counts.most_common(prune)))
+ else:
+ return counts
+
+def save_vocab(directory, filename, vocab):
+ fh = open(directory + "/" + filename, "w")
+ for word in vocab:
+ print>>fh, word
+
+def main():
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-e", "--target-language", type="string", dest="target_language")
+ parser.add_option("-f", "--source-language", type="string", dest="source_language")
+ parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
+ parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
+ parser.add_option("-a", "--align", type="string", dest="align_file")
+ parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
+ parser.add_option("-n", "--target-context", type="int", dest="n")
+ parser.add_option("-m", "--source-context", type="int", dest="m")
+ parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
+ parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")
+
+
+ parser.set_defaults(
+ target_language = "en",
+ source_language = "de",
+ corpus_stem = "train.10k",
+ align_file = "train.10k.align",
+ n = 5,
+ m = 4,
+ working_dir = "working",
+ sprune=16000,
+ tprune=16000
+ )
+ options,args = parser.parse_args(sys.argv)
+
+ if not os.path.exists(options.working_dir):
+ os.makedirs(options.working_dir)
+ else:
+ LOG.warn("Directory %s already exists, re-using" % options.working_dir)
+
+ info_file = options.working_dir + "/info"
+ if os.path.exists(info_file):
+ for line in open(info_file):
+ name,value = line[:-1].split()
+ if name == "n" and int(value) != options.n or \
+ name == "m" and int(value) != options.m:
+ LOG.error("info file exists, but parameters do not match. Delete working directory and rerun")
+ sys.exit(1)
+ else:
+ ifh = open(info_file,"w")
+ print>>ifh,"m",options.m
+ print>>ifh,"n",options.n
+ ifh.close()
+
+ scorpus = options.corpus_stem + "." + options.source_language
+ tcorpus = options.corpus_stem + "." + options.target_language
+
+ tvocab,svocab = None,None
+ # Extract vocabulary, and prune, if required
+ svocab = get_pruned_vocab(scorpus,options.sprune)
+ tvocab = get_pruned_vocab(tcorpus,options.tprune)
+
+
+ file_stem = os.path.basename(options.corpus_stem)
+ ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
+ ofh = open(ngram_file, "w")
+
+ tags = extract.get_ngrams(options.corpus_stem,
+ options.align_file,
+ options.tagged_stem,
+ svocab,
+ tvocab,
+ options.source_language,
+ options.target_language,
+ options.m,
+ options.n,
+ ofh)
+
+ # Save vocabularies
+ del svocab["<null>"]
+ del tvocab["<null>"]
+ del svocab["<unk>"]
+ del tvocab["<unk>"]
+ svocab_list = [item[0] for item in svocab.most_common()]
+ tvocab_list = [item[0] for item in tvocab.most_common()]
+
+ # UNK is always the first vocabulary element; make sure it appears
+ # in position 0. The <null> token is needed in the chart decoder in
+ # order to correctly estimate the probabilities of incomplete
+ # subphrases that are not sentence-initial.
+
+ tvocab_list.insert(0, "<null>")
+ tvocab_list.insert(0, "<unk>")
+ svocab_list.insert(0, "<unk>")
+
+ #Get tags:
+ tag_list = [item[0] for item in tags.most_common()]
+ svocab_list = svocab_list + tag_list
+ tvocab_list = tvocab_list + tag_list
+
+ save_vocab(options.working_dir, "vocab.source", svocab_list)
+ save_vocab(options.working_dir, "vocab.target", tvocab_list)
+
+ #Create vocab dictionaries that map word to ID
+ tvocab_idmap = {}
+ for i in range(len(tvocab_list)):
+ tvocab_idmap[tvocab_list[i]] = i
+
+ svocab_idmap = {}
+ for i in range(len(svocab_list)):
+ svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)
+
+ numberized_file = options.working_dir + "/" + file_stem + ".numberized"
+ ngrams_file_handle = open(ngram_file, 'r')
+ numberized_file_handle = open(numberized_file, 'w')
+
+ #Numberize the file
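+ # each ngram line holds 2m+1 source tokens then n target tokens; source
+ # ids are offset by the target vocabulary size so the ranges don't overlap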
+ for line in ngrams_file_handle:
+ line = line.split()
+ source_words = line[:(2*options.m + 1)]
+ target_words = line[-options.n:]
+
+ numberized_line = ""
+ for item in source_words:
+ numberized_line = numberized_line + str(svocab_idmap[item]) + " "
+
+ for item in target_words:
+ numberized_line = numberized_line + str(tvocab_idmap[item]) + " "
+
+ #Write to file replacing the last space with new line
+ numberized_file_handle.write(numberized_line[:-1] + "\n")
+ numberized_file_handle.close()
+ ngrams_file_handle.close()
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/nplm-training/reduce_ngrams.py b/scripts/nplm-training/reduce_ngrams.py
new file mode 100755
index 000000000..65795a10c
--- /dev/null
+++ b/scripts/nplm-training/reduce_ngrams.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+"""Reduces an ngrams file for training nplm to a smaller version of it with less ngrams"""
+from sys import argv
+
+if len(argv) != 5:
+ print("Wrong number of args, got: " + str(len(argv) - 1) + " expected 4.")
+ print("Usage: reduce_ngrams.py INFILE OUTFILE START_IDX NGRAMS")
+ exit()
+
+INFILE = open(argv[1], 'r')
+OUTFILE = open(argv[2], 'w')
+START_IDX = int(argv[3])
+NGRAMS = int(argv[4])
+
+for line in INFILE:
+ line = line.split()
+ line = line[START_IDX:START_IDX+NGRAMS]
+ linetowrite = ""
+ for token in line:
+ linetowrite = linetowrite + token + " "
+ #Strip trailing space and add newline
+ linetowrite = linetowrite[:-1]
+ linetowrite = linetowrite + '\n'
+ OUTFILE.write(linetowrite)
+
+INFILE.close()
+OUTFILE.close()
diff --git a/scripts/nplm-training/tag.sh b/scripts/nplm-training/tag.sh
new file mode 100755
index 000000000..7a8e1dc70
--- /dev/null
+++ b/scripts/nplm-training/tag.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+WRAP_DIR=~/moses.new/scripts/training/wrappers/
+
+
+tagger=$WRAP_DIR/make-factor-en-pos.mxpost.perl
+lang=en
+for stem in test train.10k train.100k; do
+ $tagger -mxpost /home/pkoehn/statmt/project/mxpost $stem.$lang $stem.tagged.$lang /tmp
+done
+
+tagger=$WRAP_DIR/make-factor-de-pos.perl
+lang=de
+for stem in test train.10k train.100k; do
+ $tagger $stem.$lang $stem.tagged.$lang /tmp
+done
+
diff --git a/scripts/nplm-training/test_nplm.py b/scripts/nplm-training/test_nplm.py
new file mode 100755
index 000000000..51b8cebda
--- /dev/null
+++ b/scripts/nplm-training/test_nplm.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import logging
+import optparse
+import subprocess
+import sys
+
+
+def main():
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-w", "--working-dir", dest="working_dir")
+ parser.add_option("-c", "--corpus", dest="corpus_stem")
+ parser.add_option("-r", "--train-corpus", dest="train_stem")
+ parser.add_option("-l", "--nplm-home", dest="nplm_home")
+ parser.add_option("-e", "--epoch", dest="epoch", type="int")
+ parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
+ parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
+ parser.add_option("-t", "--threads", dest="threads", type="int")
+
+ parser.set_defaults(
+ working_dir = "working"
+ ,corpus_stem = "test"
+ ,train_stem = "train.10k"
+ ,nplm_home = "/home/bhaddow/tools/nplm"
+ ,epoch=10
+ ,ngram_size = 14
+ ,minibatch_size=1000
+ ,threads=8
+ )
+
+ options,args = parser.parse_args(sys.argv)
+
+ model_prefix = options.working_dir + "/" + options.train_stem + ".model.nplm"
+ model_file = model_prefix + "." + str(options.epoch)
+ test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
+ prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
+ vocab_file = options.working_dir + "/vocab"
+
+ #TODO: Get ngram size from info file.
+ prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", test_file, "--ngram_size",
+ str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file]
+ ret = subprocess.call(prep_args)
+ if ret: raise Exception("Preparation failed")
+
+ test_args = [options.nplm_home + "/src/testNeuralNetwork", "--test_file", prep_file, "--model_file",
+ model_file , "--minibatch_size", str(options.minibatch_size), "--num_threads", str(options.threads)]
+ ret = subprocess.call(test_args)
+ if ret: raise Exception("Testing failed")
+
+#$ROOT/src/prepareNeuralLM --train_text $TESTFILE1 --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
+
+#$ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
+
+if __name__ == "__main__":
+ main()
+
diff --git a/scripts/nplm-training/train_nplm.py b/scripts/nplm-training/train_nplm.py
new file mode 100755
index 000000000..1af6cf325
--- /dev/null
+++ b/scripts/nplm-training/train_nplm.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+import logging
+import optparse
+import subprocess
+import sys
+import os
+
+def main():
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-w", "--working-dir", dest="working_dir")
+ parser.add_option("-c", "--corpus", dest="corpus_stem")
+ parser.add_option("-l", "--nplm-home", dest="nplm_home")
+ parser.add_option("-e", "--epochs", dest="epochs", type="int")
+ parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
+ parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
+ parser.add_option("-s", "--noise", dest="noise", type="int")
+ parser.add_option("-d", "--hidden", dest="hidden", type="int")
+ parser.add_option("-i", "--input-embedding", dest="input_embedding", type="int")
+ parser.add_option("-o", "--output-embedding", dest="output_embedding", type="int")
+ parser.add_option("-t", "--threads", dest="threads", type="int")
+ parser.add_option("-m", "--output-model", dest="output_model")
+ parser.add_option("-r", "--output-dir", dest="output_dir")
+ parser.add_option("-f", "--config-options-file", dest="config_options_file")
+ parser.add_option("-g", "--log-file", dest="log_file")
+ parser.add_option("-v", "--validation-ngrams", dest="validation_file")
+ parser.add_option("-a", "--activation-function", dest="activation_fn")
+
+ parser.set_defaults(
+ working_dir = "working"
+ ,corpus_stem = "train.10k"
+ ,nplm_home = "/home/bhaddow/tools/nplm"
+ ,epochs = 10
+ ,ngram_size = 14
+ ,minibatch_size=1000
+ ,noise=100
+ ,hidden=750
+ ,input_embedding=150
+ ,output_embedding=150
+ ,threads=1
+ ,output_model = "train.10k"
+ ,output_dir = None
+ ,config_options_file = "config"
+ ,log_file = "log"
+ ,validation_file = None
+ ,activation_fn = "rectifier"
+ )
+
+ options,args = parser.parse_args(sys.argv)
+
+ # Set up validation command variable to use with validation set.
+ validations_command = []
+ if options.validation_file is not None:
+ validations_command =["--validation_file", (options.validation_file + ".numberized")]
+
+
+ # In order to allow for different models to be trained after the same
+ # preparation step, we should provide an option for multiple output directories
+ # If we have not set output_dir, set it to the same thing as the working dir
+
+ if options.output_dir is None:
+ options.output_dir = options.working_dir
+ else:
+ # Create output dir if necessary
+ if not os.path.exists(options.output_dir):
+ os.makedirs(options.output_dir)
+
+ config_file = options.output_dir + "/" + options.config_options_file + '-' + options.output_model
+ log_file = options.output_dir + "/" + options.log_file + '-' + options.output_model
+ log_file_write = open(log_file, 'w')
+ config_file_write = open(config_file, 'w')
+
+ config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
+
+ in_file = options.working_dir + "/" + options.corpus_stem + ".numberized"
+
+
+ model_prefix = options.output_dir + "/" + options.output_model + ".model.nplm"
+ train_args = [options.nplm_home + "/src/trainNeuralNetwork", "--train_file", in_file, "--num_epochs", str(options.epochs),
+ "--model_prefix",
+ model_prefix, "--learning_rate", "1", "--minibatch_size", str(options.minibatch_size),
+ "--num_noise_samples", str(options.noise), "--num_hidden", str(options.hidden), "--input_embedding_dimension",
+ str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads",
+ str(options.threads), "--activation_function", options.activation_fn] + validations_command
+ print "Train model command: "
+ print ', '.join(train_args)
+
+ config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
+ config_file_write.close()
+
+ log_file_write.write("Training output:\n")
+ ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
+ if ret: raise Exception("Training failed")
+
+ log_file_write.close()
+
+if __name__ == "__main__":
+ main()
+
+
+
+
+#EPOCHS=10
+#NGRAM_SIZE=14
+#MINIBATCH_SIZE=1000
+#NOISE=100
+#HIDDEN=750
+#INPUT_EMBEDDING=150
+#OUTPUT_EMBEDDING=150
+#THREADS=8
+#
+
+#$ROOT/src/prepareNeuralLM --train_text $INFILE --ngram_size $NGRAM_SIZE --ngramize 0 --words_file $VOCAB --train_file $WORKDIR/train.ngrams || exit 1
+
+#$ROOT/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams \
+# --num_epochs $EPOCHS --input_words_file $VOCAB --output_words_file $VOCAB --model_prefix $WORKDIR/$PREFIX \
+# --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
+# --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
+
+
diff --git a/scripts/other/blame-stat.sh b/scripts/other/blame-stat.sh
new file mode 100755
index 000000000..7ceddfc5d
--- /dev/null
+++ b/scripts/other/blame-stat.sh
@@ -0,0 +1,4 @@
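+# Per-author count of surviving lines in the repository (case-insensitive
+# author matching), most prolific authors first.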
+git ls-files | xargs -n1 git blame --line-porcelain | sed -n 's/^author //p' | sort -f | uniq -ic | sort -nr
+
+#git ls-files | grep -Ei "\.h$|\.cpp$|\.hh$|\.cc$" | xargs -n1 git blame --line-porcelain | sed -n 's/^author //p' | sort -f | uniq -ic | sort -nr
+
diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl
index fa833dbd6..b12aa6147 100755
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -118,7 +118,7 @@ sub train_lm {
}
else {
$LM = "KENLM";
- $cmd = "$BUILD_KENLM --prune 0 0 1 -S 50% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.gz";
+ $cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.gz";
}
print STDERR "** Using $LM **" . "\n";
print STDERR $cmd."\n";
diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl
new file mode 100755
index 000000000..ca4e8a1b3
--- /dev/null
+++ b/scripts/tokenizer/deescape-special-chars-PTB.perl
@@ -0,0 +1,19 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+while(<STDIN>) {
+ s/\&bar;/\|/g; # factor separator (legacy)
+ s/\&#124;/\|/g; # factor separator
+ s/\&lt;/\</g; # xml
+ s/\&gt;/\>/g; # xml
+ s/\&bra;/\[/g; # syntax non-terminal (legacy)
+ s/\&ket;/\]/g; # syntax non-terminal (legacy)
+ s/\&quot;/\"/g; # xml
+ s/\&apos;/\'/g; # xml
+ s/\&#91;/\[/g; # syntax non-terminal
+ s/\&#93;/\]/g; # syntax non-terminal
+ s/\&amp;/\&/g; # escape escape
+ s/\"([^\"]*)\"/\`\`$1\'\'/g;
+ print $_;
+}
diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl
new file mode 100755
index 000000000..006fb9c2d
--- /dev/null
+++ b/scripts/tokenizer/tokenizer_PTB.perl
@@ -0,0 +1,399 @@
+#!/usr/bin/perl -w
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines per thread (default is 2000); this option controls how much memory is needed: the larger this number, the more memory is required (and the faster the tokenization);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+use Thread;
+
+my $mydir = "$RealBin/../share/nonbreaking_prefixes";
+
+my %NONBREAKING_PREFIX = ();
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+
+while (@ARGV)
+{
+ $_ = shift;
+ /^-b$/ && ($| = 1, next);
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+ /^-x$/ && ($SKIP_XML = 1, next);
+ /^-a$/ && ($AGGRESSIVE = 1, next);
+ /^-time$/ && ($TIMING = 1, next);
+ /^-threads$/ && ($NUM_THREADS = int(shift), next);
+ /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+}
+
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+ $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+
+# print help message
+if ($HELP)
+{
+ print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+ print "Options:\n";
+ print " -q ... quiet.\n";
+ print " -a ... aggressive hyphen splitting.\n";
+ print " -b ... disable Perl buffering.\n";
+ print " -time ... enable processing time calculation.\n";
+ exit;
+}
+
+if (!$QUIET)
+{
+ print STDERR "Tokenizer Version 1.1\n";
+ print STDERR "Language: $language\n";
+ print STDERR "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+if (scalar(keys %NONBREAKING_PREFIX) == 0)
+{
+ print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+my @batch_sentences = ();
+my @thread_list = ();
+my $count_sentences = 0;
+
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
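+ # Buffer NUM_SENTENCES_PER_THREAD*NUM_THREADS lines, hand each thread one
+ # contiguous slice, and join the threads in creation order so the output
+ # preserves the input line order; then reset for the next batch.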
+ while(<STDIN>)
+ {
+ $count_sentences = $count_sentences + 1;
+ push(@batch_sentences, $_);
+ if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+ {
+ # assign each thread work
+ for (my $i=0; $i<$NUM_THREADS; $i++)
+ {
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+ my $new_thread = Thread->new(\&tokenize_batch, @subbatch_sentences);
+ push(@thread_list, $new_thread);
+ }
+ foreach (@thread_list)
+ {
+ my $tokenized_list = $_->join;
+ foreach (@$tokenized_list)
+ {
+ print $_;
+ }
+ }
+ # reset for the new run
+ @thread_list = ();
+ @batch_sentences = ();
+ }
+ }
+ # the last batch
+ if (scalar(@batch_sentences)>0)
+ {
+ # assign each thread work
+ for (my $i=0; $i<$NUM_THREADS; $i++)
+ {
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+ if ($start_index >= scalar(@batch_sentences))
+ {
+ last;
+ }
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+ if ($end_index >= scalar(@batch_sentences))
+ {
+ $end_index = scalar(@batch_sentences)-1;
+ }
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+ my $new_thread = Thread->new(\&tokenize_batch, @subbatch_sentences);
+ push(@thread_list, $new_thread);
+ }
+ foreach (@thread_list)
+ {
+ my $tokenized_list = $_->join;
+ foreach (@$tokenized_list)
+ {
+ print $_;
+ }
+ }
+ }
+}
+else
+{# single thread only
+ while(<STDIN>)
+ {
+ $count_sentences = $count_sentences + 1; # count lines, needed for the -time report
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+ {
+ #don't try to tokenize XML/HTML tag lines
+ print $_;
+ }
+ else
+ {
+ print &tokenize($_);
+ }
+ }
+}
+
+if ($TIMING)
+{
+ my $duration = Time::HiRes::tv_interval( $start_time );
+ print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+ print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# input: an array containing a batch of texts
+# return: a reference to an array containing the tokenized texts for the input array
+sub tokenize_batch
+{
+ my(@text_list) = @_;
+ my(@tokenized_list) = ();
+ foreach (@text_list)
+ {
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+ {
+ #don't try to tokenize XML/HTML tag lines
+ push(@tokenized_list, $_);
+ }
+ else
+ {
+ push(@tokenized_list, &tokenize($_));
+ }
+ }
+ return \@tokenized_list;
+}
+
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string
+sub tokenize
+{
+ my($text) = @_;
+
+ #clean some stuff so you don't get &amp; -> &amp;amp;
+ #news-commentary stuff
+
+ $text =~ s/\&#45;/ /g;
+ $text =~ s/\&45;/ /g;
+ $text =~ s/\&#160;/ /g;
+ $text =~ s/\&gt;/\>/g;
+ $text =~ s/\&lt;/\</g;
+ $text =~ s/ampquot;/\"/g;
+ $text =~ s/ampquot/\"/g;
+ $text =~ s/\&quot;/\"/g;
+ $text =~ s/\&amp;/\&/g;
+ $text =~ s/\&nbsp;/ /g;
+ $text =~ s/\&#91;/\[/g; # syntax non-terminal
+ $text =~ s/\&#93;/\]/g; # syntax non-terminal
+ $text =~ s/\&bar;/\|/g; # factor separator (legacy)
+ $text =~ s/\&#124;/\|/g; # factor separator
+ $text =~ s/(\.){4,}/ /g; #remove junk like ........
+ $text =~ s/--/ -- /g;
+
+ chomp($text);
+ $text = " $text ";
+
+ # remove ASCII junk
+ $text =~ s/\s+/ /g;
+ $text =~ s/[\000-\037]//g;
+
+ # separate out all "other" special characters
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+ # aggressive hyphen splitting
+ if ($AGGRESSIVE)
+ {
+ $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+ }
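+ # (aggressive mode example: "state-of-the-art" -> "state @-@ of @-@ the @-@ art")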
+
+ #multi-dots stay together
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+ while($text =~ /DOTMULTI\./)
+ {
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+ }
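+ # e.g. "..." becomes the single token "DOTDOTDOTMULTI", which survives the
+ # splitting below and is expanded back into dots near the end of tokenize().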
+
+ # separate out "," except if within numbers (5,300)
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ # separate , pre and post number
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+ # turn ` into '
+ $text =~ s/\`/\'/g;
+
+ #turn '' into "
+ $text =~ s/\'\'/ \" /g;
+
+ if ($language eq "en")
+ {
+ #split contractions right
+ # $text =~ s/ [']([\p{IsAlpha}])/ '$1/g; #MARIA: is pretokenized for parsing vb'll -> vb 'll
+ $text =~ s/([Dd])'ye/$1o you/g;
+ $text =~ s/([Dd])'you/$1o you/g;
+ $text =~ s/'Tis/It is/g;
+ $text =~ s/'tis/it is/g;
+ $text =~ s/'Twas/It was/g;
+ $text =~ s/'twas/it was/g;
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])['][ ]([sSmMdDtT]\s)/$1 '$2/g; # Commissioner' s
+ $text =~ s/([\p{IsAlpha}])['][ ](ll|ve) /$1 '$2 /g; # I' ve I' ll
+ $text =~ s/ ['] ([sSmMdDtT]\s)/ '$1/g; # Maria 's -> Maria ' s -> Maria 's
+ $text =~ s/ ['] (ll|ve) / '$1 /g; # I 'll I 've
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+ #$text =~ s/ ['] ([\p{IsAlpha}])/ '$1/g; # I 'll 1999 's
+ $text =~ s/([\p{IsAlpha}])n [']t/$1 n't/g; #don't -> do n't (don't first splits into don 't)
+ $text =~ s/([\p{IsAlpha}])n ['] t/$1 n't/g;
+ $text =~ s/([\p{IsAlpha}])N [']T/$1 N'T/g;
+ #special case for "1990's"
+ $text =~ s/([\p{IsN}])[']s/$1 's/g;
+ $text =~ s/([\p{IsN}]) [']s/$1 's/g;
+ $text =~ s/([\p{IsN}]) ['] s/$1 's/g;
+
+ #other English contractions -> from PTB tokenizer.sed
+ $text =~ s/([Cc])annot/$1an not/g;
+ $text =~ s/([Gg])imme/$1im me/g;
+ $text =~ s/([Gg])onna/$1on na/g;
+ $text =~ s/([Gg])otta/$1ot ta/g;
+ $text =~ s/([Ll])emme/$1em me/g;
+ $text =~ s/([Ww])anna/$1an na/g;
+ $text =~ s/([Dd]) 'ye/$1' ye/g;
+
+ }
+ elsif (($language eq "fr") or ($language eq "it"))
+ {
+ #split contractions left
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+ }
+ else
+ {
+ $text =~ s/\'/ \' /g;
+ }
+
+ #word token method
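+ # For each word ending in ".", keep the period attached if the prefix is a
+ # known non-breaking abbreviation, contains an internal dot, or the next
+ # word starts lowercase; a #NUMERIC_ONLY# prefix only keeps the period when
+ # a number follows. Otherwise the period is split off as its own token.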
+ my @words = split(/\s/,$text);
+ $text = "";
+ for (my $i=0;$i<(scalar(@words));$i++)
+ {
+ my $word = $words[$i];
+ if ( $word =~ /^(\S+)\.$/)
+ {
+ my $pre = $1;
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+ {
+ #no change
+ }
+ elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+ {
+ #no change
+ }
+ else
+ {
+ $word = $pre." .";
+ }
+ }
+ $text .= $word." ";
+ }
+
+ # clean up extraneous spaces
+ $text =~ s/ +/ /g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ #restore multi-dots
+ while($text =~ /DOTDOTMULTI/)
+ {
+ $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+ }
+ $text =~ s/DOTMULTI/./g;
+
+ #escape special chars
+ $text =~ s/\&/\&amp;/g; # escape escape
+ $text =~ s/\|/\&#124;/g; # factor separator
+ $text =~ s/\</\&lt;/g; # xml
+ $text =~ s/\>/\&gt;/g; # xml
+ $text =~ s/\'/\&apos;/g; # xml
+ $text =~ s/\"/\&quot;/g; # xml
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
+
+ #ensure final line break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
+}
+
+sub load_prefixes
+{
+ my ($language, $PREFIX_REF) = @_;
+
+ my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+ #default back to English if we don't have a language-specific prefix file
+ if (!(-e $prefixfile))
+ {
+ $prefixfile = "$mydir/nonbreaking_prefix.en";
+ print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+ die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+ }
+
+ if (-e "$prefixfile")
+ {
+ open(PREFIX, "<:utf8", "$prefixfile");
+ while (<PREFIX>)
+ {
+ my $item = $_;
+ chomp($item);
+ if (($item) && (substr($item,0,1) ne "#"))
+ {
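+ # File format: one prefix per line, "#" starts a comment; a trailing
+ # #NUMERIC_ONLY# tag (stored as 2, plain prefixes as 1) restricts the
+ # prefix to positions directly before a number.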
+ if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+ {
+ $PREFIX_REF->{$1} = 2;
+ }
+ else
+ {
+ $PREFIX_REF->{$item} = 1;
+ }
+ }
+ }
+ close(PREFIX);
+ }
+}
+
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index fdb1ad53f..895b64b96 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -113,6 +113,7 @@ while(my $line = <INI>) {
|| $line =~ /PhraseDictionaryBinary /
|| $line =~ /PhraseDictionaryOnDisk /
|| $line =~ /PhraseDictionarySCFG /
+ || $line =~ /RuleTable /
) {
print STDERR "pt:$line\n";
@@ -143,7 +144,7 @@ while(my $line = <INI>) {
}
} #for (my $i = 1; $i < scalar(@toks); ++$i) {
- if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG") || $file =~ /glue-grammar/ || $skip) {
+ if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG" && $phrase_table_impl ne "RuleTable") || $file =~ /glue-grammar/ || $skip) {
- # Only Memory ("0") and NewFormat ("6") can be filtered.
+ # Only PhraseDictionaryMemory, PhraseDictionarySCFG, and RuleTable can be filtered.
print INI_OUT "$line\n";
next;
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 55e9106b7..25d12a8ab 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -396,16 +396,16 @@ if (!defined $mertargs) {
}
my $scconfig = undef;
-if ($mertargs =~ /\-\-scconfig\s+(.+?)(\s|$)/) {
+if ($mertargs =~ /\-\-scconfig(?:\s+|=)(.+?)(\s|$)/) {
$scconfig = $1;
$scconfig =~ s/\,/ /g;
- $mertargs =~ s/\-\-scconfig\s+(.+?)(\s|$)//;
+ $mertargs =~ s/\-\-scconfig(?:\s+|=)(.+?)(\s|$)//;
}
my $sctype = "--sctype BLEU";
-if ($mertargs =~ /(\-\-sctype\s+.+?)(\s|$)/) {
+if ($mertargs =~ /(\-\-sctype(?:\s+|=).+?)(\s|$)/) {
$sctype = $1;
- $mertargs =~ s/(\-\-sctype\s+.+?)(\s|$)//;
+ $mertargs =~ s/(\-\-sctype(?:\s+|=).+?)(\s|$)//;
}
@@ -432,7 +432,7 @@ $proargs = "" unless $proargs;
my $mert_mert_args = "$mertargs $mertmertargs";
$mert_mert_args =~ s/\-+(binary|b)\b//;
-$mert_mert_args .= " $scconfig";
+$mert_mert_args .= " $sctype $scconfig";
if ($___ACTIVATE_FEATURES) {
$mert_mert_args .= " -o \"$___ACTIVATE_FEATURES\"";
}
@@ -1647,7 +1647,7 @@ sub create_extractor_script() {
open my $out, '>', $script_path
or die "Couldn't open $script_path for writing: $!\n";
- print $out "#!/bin/bash\n";
+ print $out "#!/usr/bin/env bash\n";
print $out "cd $outdir\n";
print $out "$cmd\n";
close $out;
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 06b1ec45d..ffc1e09c7 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
- $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
+ $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,$_S2T,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
@@ -105,6 +105,7 @@ $_HELP = 1
'generation-type=s' => \@_GENERATION_TYPE,
'continue' => \$_CONTINUE,
'hierarchical' => \$_HIERARCHICAL,
+ 's2t' => \$_S2T,
'glue-grammar' => \$_GLUE_GRAMMAR,
'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE,
'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE,
@@ -1980,6 +1981,10 @@ sub create_ini {
$phrase_table_impl_name = "PhraseDictionaryBitextSampling" if $phrase_table_impl==11;
$file .= "/" if $phrase_table_impl==11 && $file !~ /\/$/;
+ if ($_S2T) {
+ $phrase_table_impl_name = "RuleTable";
+ }
+
# table limit (maximum number of translation options per input phrase)
my $table_limit = 0;
if ($i == 0) {
@@ -2011,7 +2016,11 @@ sub create_ini {
# glue grammar
if ($_GLUE_GRAMMAR) {
&full_path(\$___GLUE_GRAMMAR_FILE);
- $feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i num-features=1 path=$___GLUE_GRAMMAR_FILE input-factor=0 output-factor=0\n";
+ my $feature_name = "PhraseDictionaryMemory";
+ if ($_S2T) {
+ $feature_name = "RuleTable";
+ }
+ $feature_spec .= "$feature_name name=TranslationModel$i num-features=1 path=$___GLUE_GRAMMAR_FILE input-factor=0 output-factor=0\n";
$weight_spec .= "TranslationModel$i= 1.0\n";
}
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
new file mode 100755
index 000000000..e447ee146
--- /dev/null
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
@@ -0,0 +1,54 @@
+#!/usr/bin/perl -w
+
+use strict;
+
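+# Convert Berkeley-parser bracketed parses (PTB -LRB-/-RRB- conventions) into
+# Moses <tree label="..."> XML, one sentence per line. Illustrative use:
+#   berkeleyparsed2mosesxml_PTB.perl < corpus.parsed > corpus.xml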
+while(<STDIN>) {
+ if (/^\(\(\)\)/) {
+ print "\n"; # parse failures
+ next;
+ }
+
+ # prep
+ s/^\( /\(TOP /;
+
+ # escape words
+ s/\&/\&amp;/g; # escape escape
+ s/\|/\&bar;/g; # factor separator
+ s/\|/\&#124;/g; # factor separator
+ s/\</\&lt;/g; # xml
+ s/\>/\&gt;/g; # xml
+ s/\'\'/\&quot;/g;
+ s/``/\&quot;/g;
+ s/\'/\&apos;/g; # xml
+ s/\"/\&quot;/g; # xml
+ s/\[/\&#91;/g; # syntax non-terminal
+ s/\]/\&#93;/g; # syntax non-terminal
+
+ # escape parentheses that were part of the input text
+ s/(\(\S+ )\(\)/$1\&openingparenthesis;\)/g;
+ s/(\(\S+ )\)\)/$1\&closingparenthesis;\)/g;
+
+ # convert into tree
+ s/\((\S+) /<tree label=\"$1\"> /g;
+ s/\)/ <\/tree> /g;
+ s/\"\-LRB\-\"/\"LRB\"/g; # labels
+ s/\"\-RRB\-\"/\"RRB\"/g;
+ s/\-LRB\-/\(/g; # tokens
+ s/\-RRB\-/\)/g;
+ s/ +/ /g;
+ s/ $//g;
+
+ # de-escape parentheses that were part of the input text
+ s/\&openingparenthesis;/\(/g;
+ s/\&closingparenthesis;/\)/g;
+
+ s/tree label=\"\&quot;\"/tree label=\"QUOT\"/g;
+ #s/tree label=\"''\"/tree label=\"QUOT\"/g;
+ #s/tree label=\"``\"/tree label=\"QUOT\"/g;
+
+ # output
+ print $_;
+}
diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
index 13aa7f912..38e331737 100755
--- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
+++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
@@ -34,6 +34,7 @@ sub read_cluster_from_mkcls {
my ($file) = @_;
my %CLUSTER;
open(CLUSTER_FILE,$file) || die("ERROR: could not open cluster file '$file'");
+ binmode(CLUSTER_FILE, ":utf8");
while(<CLUSTER_FILE>) {
chop;
my ($word,$cluster) = split;
@@ -45,3 +46,5 @@ sub read_cluster_from_mkcls {
sub add_cluster_to_string {
}
+
+
diff --git a/search/edge_generator.hh b/search/edge_generator.hh
index 203942c6f..048f9f9ac 100644
--- a/search/edge_generator.hh
+++ b/search/edge_generator.hh
@@ -8,7 +8,7 @@
namespace lm {
namespace ngram {
-class ChartState;
+struct ChartState;
} // namespace ngram
} // namespace lm
diff --git a/search/types.hh b/search/types.hh
index f9c849b3f..832ef159f 100644
--- a/search/types.hh
+++ b/search/types.hh
@@ -3,7 +3,7 @@
#include <stdint.h>
-namespace lm { namespace ngram { class ChartState; } }
+namespace lm { namespace ngram { struct ChartState; } }
namespace search {
diff --git a/search/vertex_generator.hh b/search/vertex_generator.hh
index 6fce508d6..328da7933 100644
--- a/search/vertex_generator.hh
+++ b/search/vertex_generator.hh
@@ -7,7 +7,7 @@
namespace lm {
namespace ngram {
-class ChartState;
+struct ChartState;
} // namespace ngram
} // namespace lm
diff --git a/util/file.cc b/util/file.cc
index 25ff8183a..aa61cf9a9 100644
--- a/util/file.cc
+++ b/util/file.cc
@@ -41,9 +41,9 @@ scoped_fd::~scoped_fd() {
}
}
-scoped_FILE::~scoped_FILE() {
- if (file_ && std::fclose(file_)) {
- std::cerr << "Could not close file " << std::endl;
+void scoped_FILE_closer::Close(std::FILE *file) {
+ if (file && std::fclose(file)) {
+ std::cerr << "Could not close file " << file << std::endl;
std::abort();
}
}
diff --git a/util/file.hh b/util/file.hh
index f2bb319d5..7204b6a04 100644
--- a/util/file.hh
+++ b/util/file.hh
@@ -2,6 +2,7 @@
#define UTIL_FILE_H
#include "util/exception.hh"
+#include "util/scoped.hh"
#include "util/string_piece.hh"
#include <cstddef>
@@ -42,29 +43,10 @@ class scoped_fd {
scoped_fd &operator=(const scoped_fd &);
};
-class scoped_FILE {
- public:
- explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {}
-
- ~scoped_FILE();
-
- std::FILE *get() { return file_; }
- const std::FILE *get() const { return file_; }
-
- void reset(std::FILE *to = NULL) {
- scoped_FILE other(file_);
- file_ = to;
- }
-
- std::FILE *release() {
- std::FILE *ret = file_;
- file_ = NULL;
- return ret;
- }
-
- private:
- std::FILE *file_;
+struct scoped_FILE_closer {
+ static void Close(std::FILE *file);
};
+typedef scoped<std::FILE, scoped_FILE_closer> scoped_FILE;
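+// scoped_FILE keeps its old get()/reset()/release() interface; it is now an
+// instantiation of the generic util::scoped template with a Closer policy
+// (see util/scoped.hh).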
/* Thrown for any operation where the fd is known. */
class FDException : public ErrnoException {
diff --git a/util/scoped.cc b/util/scoped.cc
index 6c5b0c2db..de1d9e940 100644
--- a/util/scoped.cc
+++ b/util/scoped.cc
@@ -32,10 +32,6 @@ void *CallocOrThrow(std::size_t requested) {
return InspectAddr(std::calloc(1, requested), requested, "calloc");
}
-scoped_malloc::~scoped_malloc() {
- std::free(p_);
-}
-
void scoped_malloc::call_realloc(std::size_t requested) {
p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc");
}
diff --git a/util/scoped.hh b/util/scoped.hh
index ae70b6b53..60c36c36a 100644
--- a/util/scoped.hh
+++ b/util/scoped.hh
@@ -4,6 +4,7 @@
#include "util/exception.hh"
#include <cstddef>
+#include <cstdlib>
namespace util {
@@ -16,87 +17,91 @@ class MallocException : public ErrnoException {
void *MallocOrThrow(std::size_t requested);
void *CallocOrThrow(std::size_t requested);
-class scoped_malloc {
+/* Unfortunately, defining operator* for void * makes the compiler complain,
+ * so scoped is specialized for void. scoped_base contains the functionality
+ * common to both, namely everything except dereferencing (operator*).
+ */
+template <class T, class Closer> class scoped_base {
public:
- scoped_malloc() : p_(NULL) {}
+ explicit scoped_base(T *p = NULL) : p_(p) {}
- scoped_malloc(void *p) : p_(p) {}
+ ~scoped_base() { Closer::Close(p_); }
- ~scoped_malloc();
-
- void reset(void *p = NULL) {
- scoped_malloc other(p_);
+ void reset(T *p = NULL) {
+ scoped_base other(p_);
p_ = p;
}
- void call_realloc(std::size_t to);
-
- void *get() { return p_; }
- const void *get() const { return p_; }
-
- private:
- void *p_;
-
- scoped_malloc(const scoped_malloc &);
- scoped_malloc &operator=(const scoped_malloc &);
-};
-
-// Hat tip to boost.
-template <class T> class scoped_array {
- public:
- explicit scoped_array(T *content = NULL) : c_(content) {}
-
- ~scoped_array() { delete [] c_; }
-
- T *get() { return c_; }
- const T* get() const { return c_; }
+ T *get() { return p_; }
+ const T *get() const { return p_; }
- T &operator*() { return *c_; }
- const T&operator*() const { return *c_; }
+ T *operator->() { return p_; }
+ const T *operator->() const { return p_; }
- T &operator[](std::size_t idx) { return c_[idx]; }
- const T &operator[](std::size_t idx) const { return c_[idx]; }
-
- void reset(T *to = NULL) {
- scoped_array<T> other(c_);
- c_ = to;
+ T *release() {
+ T *ret = p_;
+ p_ = NULL;
+ return ret;
}
- private:
- T *c_;
+ protected:
+ T *p_;
- scoped_array(const scoped_array &);
- void operator=(const scoped_array &);
+ private:
+ scoped_base(const scoped_base &);
+ scoped_base &operator=(const scoped_base &);
};
-template <class T> class scoped_ptr {
+template <class T, class Closer> class scoped : public scoped_base<T, Closer> {
public:
- explicit scoped_ptr(T *content = NULL) : c_(content) {}
+ explicit scoped(T *p = NULL) : scoped_base<T, Closer>(p) {}
- ~scoped_ptr() { delete c_; }
+ T &operator*() { return *scoped_base<T, Closer>::p_; }
+ const T &operator*() const { return *scoped_base<T, Closer>::p_; }
+};
- T *get() { return c_; }
- const T* get() const { return c_; }
+template <class Closer> class scoped<void, Closer> : public scoped_base<void, Closer> {
+ public:
+ explicit scoped(void *p = NULL) : scoped_base<void, Closer>(p) {}
+};
- T &operator*() { return *c_; }
- const T&operator*() const { return *c_; }
+/* Closer for C functions like std::free and cmph cleanup functions */
+template <class T, void (*clean)(T*)> struct scoped_c_forward {
+ static void Close(T *p) { clean(p); }
+};
+// Scoped pointer that calls a C cleanup function on destruction
+template <class T, void (*clean)(T*)> class scoped_c : public scoped<T, scoped_c_forward<T, clean> > {
+ public:
+ explicit scoped_c(T *p = NULL) : scoped<T, scoped_c_forward<T, clean> >(p) {}
+};
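+// Illustrative only: guarding a C-style handle whose cleanup function has the
+// signature void cleanup(T*); the names below are hypothetical.
+//   extern void free_widget(widget *w);
+//   util::scoped_c<widget, free_widget> guard(make_widget());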
- T *operator->() { return c_; }
- const T*operator->() const { return c_; }
+class scoped_malloc : public scoped_c<void, std::free> {
+ public:
+ explicit scoped_malloc(void *p = NULL) : scoped_c<void, std::free>(p) {}
- T &operator[](std::size_t idx) { return c_[idx]; }
- const T &operator[](std::size_t idx) const { return c_[idx]; }
+ void call_realloc(std::size_t to);
+};
- void reset(T *to = NULL) {
- scoped_ptr<T> other(c_);
- c_ = to;
- }
+/* scoped_array using delete[] */
+struct scoped_delete_array_forward {
+ template <class T> static void Close(T *p) { delete [] p; }
+};
+// Hat tip to boost.
+template <class T> class scoped_array : public scoped<T, scoped_delete_array_forward> {
+ public:
+ explicit scoped_array(T *p = NULL) : scoped<T, scoped_delete_array_forward>(p) {}
- private:
- T *c_;
+ T &operator[](std::size_t idx) { return scoped<T, scoped_delete_array_forward>::p_[idx]; }
+ const T &operator[](std::size_t idx) const { return scoped<T, scoped_delete_array_forward>::p_[idx]; }
+};
- scoped_ptr(const scoped_ptr &);
- void operator=(const scoped_ptr &);
+/* scoped_ptr using delete. If only there were a template typedef. */
+struct scoped_delete_forward {
+ template <class T> static void Close(T *p) { delete p; }
+};
+template <class T> class scoped_ptr : public scoped<T, scoped_delete_forward> {
+ public:
+ explicit scoped_ptr(T *p = NULL) : scoped<T, scoped_delete_forward>(p) {}
};
} // namespace util