Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicola Bertoldi <bertoldi@fbk.eu>2014-12-13 14:52:47 +0300
committerNicola Bertoldi <bertoldi@fbk.eu>2014-12-13 14:52:47 +0300
commite4eb201c52be74fee74399a6f35fcbe8eb85d834 (patch)
tree7792ef96d63262f6e28f1857741e1162c7dccbc4
parentcea2d9d8bb34a81660974cae20d66aefec4e0468 (diff)
parenta0b6b6a341e74b47bbef4652ad7fd928cf91e17c (diff)
merged master into dynamic-models and solved conflicts
-rw-r--r--.gitignore2
-rw-r--r--BUILD-INSTRUCTIONS.txt1
-rw-r--r--Jamroot71
-rw-r--r--NOTICE5
-rw-r--r--OnDiskPt/Jamfile4
-rw-r--r--OnDiskPt/Main.cpp44
-rw-r--r--OnDiskPt/Main.h2
-rw-r--r--OnDiskPt/OnDiskWrapper.cpp2
-rw-r--r--OnDiskPt/TargetPhrase.cpp70
-rw-r--r--OnDiskPt/TargetPhrase.h12
-rw-r--r--OnDiskPt/Word.cpp20
-rw-r--r--OnDiskPt/queryOnDiskPt.cpp2
-rw-r--r--biconcor/Jamfile2
-rw-r--r--biconcor/phrase-lookup.cpp132
-rw-r--r--contrib/makemteval/makemteval.ini12
-rw-r--r--contrib/makemteval/makemteval.py253
-rw-r--r--contrib/moses-speedtest/README.md122
-rw-r--r--contrib/moses-speedtest/check_for_regression.py63
-rw-r--r--contrib/moses-speedtest/cronjob7
-rw-r--r--contrib/moses-speedtest/helpers/README.md5
-rw-r--r--contrib/moses-speedtest/helpers/sys_drop_caches.py22
-rw-r--r--contrib/moses-speedtest/html/README.md5
-rw-r--r--contrib/moses-speedtest/html/index.html32
-rw-r--r--contrib/moses-speedtest/html/style.css21
-rw-r--r--contrib/moses-speedtest/html_gen.py192
-rw-r--r--contrib/moses-speedtest/runtests.py293
-rw-r--r--contrib/moses-speedtest/sys_drop_caches.py22
-rw-r--r--contrib/moses-speedtest/test_config3
-rw-r--r--contrib/moses-speedtest/testsuite_common.py54
-rw-r--r--contrib/moses-speedtest/testsuite_config5
-rw-r--r--contrib/other-builds/CreateOnDiskPt/.cproject (renamed from contrib/other-builds/moses-chart-cmd/.cproject)176
-rw-r--r--contrib/other-builds/CreateOnDiskPt/.project44
-rw-r--r--contrib/other-builds/consolidate/.cproject159
-rw-r--r--contrib/other-builds/consolidate/.project (renamed from contrib/other-builds/extract-ordering/.project)19
-rw-r--r--contrib/other-builds/extract-ghkm/.cproject128
-rw-r--r--contrib/other-builds/extract-ghkm/.project110
-rw-r--r--contrib/other-builds/extract-mixed-syntax/.cproject130
-rw-r--r--contrib/other-builds/extract-mixed-syntax/.project193
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Global.cpp37
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Global.h45
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Lattice.cpp180
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Lattice.h47
-rw-r--r--contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp149
-rw-r--r--contrib/other-builds/extract-mixed-syntax/LatticeNode.h77
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Makefile13
-rw-r--r--contrib/other-builds/extract-mixed-syntax/OutputFileStream.cpp79
-rw-r--r--contrib/other-builds/extract-mixed-syntax/OutputFileStream.h50
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Range.cpp74
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Range.h57
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Rule.cpp594
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Rule.h96
-rw-r--r--contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp102
-rw-r--r--contrib/other-builds/extract-mixed-syntax/RuleCollection.h55
-rw-r--r--contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp331
-rw-r--r--contrib/other-builds/extract-mixed-syntax/SentenceAlignment.h69
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Symbol.cpp101
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Symbol.h36
-rw-r--r--contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp56
-rw-r--r--contrib/other-builds/extract-mixed-syntax/SymbolSequence.h42
-rw-r--r--contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp245
-rw-r--r--contrib/other-builds/extract-mixed-syntax/SyntaxTree.h96
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Tunnel.cpp38
-rw-r--r--contrib/other-builds/extract-mixed-syntax/Tunnel.h49
-rw-r--r--contrib/other-builds/extract-mixed-syntax/TunnelCollection.cpp70
-rw-r--r--contrib/other-builds/extract-mixed-syntax/TunnelCollection.h61
-rw-r--r--contrib/other-builds/extract-mixed-syntax/XmlTree.cpp344
-rw-r--r--contrib/other-builds/extract-mixed-syntax/extract.cpp310
-rw-r--r--contrib/other-builds/extract-mixed-syntax/extract.h34
-rw-r--r--contrib/other-builds/extract-mixed-syntax/tables-core.cpp110
-rw-r--r--contrib/other-builds/extract-mixed-syntax/tables-core.h72
-rw-r--r--contrib/other-builds/extract-ordering/.cproject134
-rw-r--r--contrib/other-builds/extract-rules/.cproject122
-rw-r--r--contrib/other-builds/extract-rules/.gitignore1
-rw-r--r--contrib/other-builds/extract-rules/.project15
-rw-r--r--contrib/other-builds/extract/.cproject115
-rw-r--r--contrib/other-builds/extractor/.cproject11
-rw-r--r--contrib/other-builds/extractor/.project1
-rw-r--r--contrib/other-builds/lm/.cproject20
-rw-r--r--contrib/other-builds/lm/.project1030
-rw-r--r--contrib/other-builds/manual-label/.cproject130
-rw-r--r--contrib/other-builds/manual-label/DeEn.cpp50
-rw-r--r--contrib/other-builds/manual-label/DeEn.h7
-rw-r--r--contrib/other-builds/manual-label/EnOpenNLPChunker.cpp201
-rw-r--r--contrib/other-builds/manual-label/EnOpenNLPChunker.h29
-rw-r--r--contrib/other-builds/manual-label/EnPhrasalVerb.cpp226
-rw-r--r--contrib/other-builds/manual-label/EnPhrasalVerb.h11
-rw-r--r--contrib/other-builds/manual-label/LabelByInitialLetter.cpp29
-rw-r--r--contrib/other-builds/manual-label/LabelByInitialLetter.h6
-rw-r--r--contrib/other-builds/manual-label/Main.cpp195
-rw-r--r--contrib/other-builds/manual-label/Main.h27
-rw-r--r--contrib/other-builds/manual-label/Makefile7
-rw-r--r--contrib/other-builds/manual-label/manual-label.cpp88
-rw-r--r--contrib/other-builds/mert_lib/.project20
-rw-r--r--contrib/other-builds/mira/.cproject176
-rw-r--r--contrib/other-builds/mira/.project81
-rw-r--r--contrib/other-builds/moses-chart-cmd.vcxproj115
-rw-r--r--contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj486
-rw-r--r--contrib/other-builds/moses-chart-cmd/.project135
-rw-r--r--contrib/other-builds/moses-cmd/.cproject34
-rw-r--r--contrib/other-builds/moses-cmd/.project40
-rw-r--r--contrib/other-builds/moses/.cproject158
-rw-r--r--contrib/other-builds/moses/.project2950
-rw-r--r--contrib/other-builds/score/.cproject145
-rw-r--r--contrib/other-builds/score/.project10
-rw-r--r--contrib/other-builds/server/.cproject169
-rw-r--r--contrib/other-builds/server/.project39
-rw-r--r--contrib/rephraser/Jamfile1
-rw-r--r--contrib/rephraser/paraphrase.cpp148
-rw-r--r--contrib/rt/Empty.c0
-rw-r--r--contrib/rt/README9
-rwxr-xr-xcontrib/rt/compile.sh2
-rw-r--r--contrib/server/Jamfile6
-rw-r--r--contrib/server/mosesserver.cpp1023
-rw-r--r--contrib/sigtest-filter/Makefile2
-rw-r--r--contrib/sigtest-filter/filter-pt.cpp429
-rw-r--r--contrib/tmcombine/test/model5/model/lex.counts.e2f8
-rw-r--r--contrib/tmcombine/test/model5/model/lex.counts.f2e8
-rw-r--r--contrib/tmcombine/test/model5/model/lex.e2f8
-rw-r--r--contrib/tmcombine/test/model5/model/lex.f2e8
-rw-r--r--contrib/tmcombine/test/model5/model/phrase-table8
-rw-r--r--contrib/tmcombine/test/model6/model/lex.counts.e2f8
-rw-r--r--contrib/tmcombine/test/model6/model/lex.counts.f2e8
-rw-r--r--contrib/tmcombine/test/model6/model/lex.e2f8
-rw-r--r--contrib/tmcombine/test/model6/model/lex.f2e8
-rw-r--r--contrib/tmcombine/test/model6/model/phrase-table5
-rw-r--r--contrib/tmcombine/test/phrase-table_test119
-rwxr-xr-xcontrib/tmcombine/tmcombine.py10
-rw-r--r--doc/PhraseDictionaryBitextSampling.howto4
-rw-r--r--jam-files/sanity.jam13
-rw-r--r--lm/Jamfile4
-rw-r--r--lm/builder/Jamfile4
-rw-r--r--lm/builder/adjust_counts.cc50
-rw-r--r--lm/builder/adjust_counts.hh28
-rw-r--r--lm/builder/adjust_counts_test.cc5
-rw-r--r--lm/builder/dump_counts_main.cc36
-rw-r--r--lm/builder/initial_probabilities.cc7
-rw-r--r--lm/builder/interpolate.cc104
-rw-r--r--lm/builder/interpolate.hh3
-rw-r--r--lm/builder/lmplz_main.cc36
-rw-r--r--lm/builder/pipeline.cc57
-rw-r--r--lm/builder/pipeline.hh9
-rw-r--r--lm/builder/print.cc7
-rw-r--r--lm/model_test.cc2
-rw-r--r--lm/ngram_query.hh2
-rw-r--r--lm/read_arpa.hh30
-rw-r--r--lm/test.arpa2
-rw-r--r--lm/test_nounk.arpa2
-rw-r--r--lm/trie_sort.cc14
-rw-r--r--lm/wrappers/README3
-rw-r--r--lm/wrappers/nplm.cc90
-rw-r--r--lm/wrappers/nplm.hh83
-rw-r--r--mert/BleuScorer.cpp23
-rw-r--r--mert/BleuScorer.h13
-rw-r--r--mert/BleuScorerTest.cpp2
-rw-r--r--mert/CderScorer.cpp10
-rw-r--r--mert/CderScorer.h6
-rw-r--r--mert/Data.cpp6
-rw-r--r--mert/Data.h2
-rw-r--r--mert/FeatureData.h5
-rw-r--r--mert/FeatureStats.cpp30
-rw-r--r--mert/FeatureStats.h5
-rw-r--r--mert/ForestRescore.cpp432
-rw-r--r--mert/ForestRescore.h120
-rw-r--r--mert/ForestRescoreTest.cpp246
-rw-r--r--mert/HopeFearDecoder.cpp343
-rw-r--r--mert/HopeFearDecoder.h160
-rw-r--r--mert/HwcmScorer.cpp165
-rw-r--r--mert/HwcmScorer.h64
-rw-r--r--mert/Hypergraph.cpp313
-rw-r--r--mert/Hypergraph.h251
-rw-r--r--mert/HypergraphTest.cpp151
-rw-r--r--mert/InterpolatedScorer.cpp35
-rw-r--r--mert/InterpolatedScorer.h4
-rw-r--r--mert/Jamfile9
-rw-r--r--mert/MeteorScorer.cpp4
-rw-r--r--mert/MeteorScorer.h2
-rw-r--r--mert/MiraFeatureVector.cpp40
-rw-r--r--mert/MiraFeatureVector.h8
-rw-r--r--mert/MiraWeightVector.cpp19
-rw-r--r--mert/MiraWeightVector.h8
-rw-r--r--mert/PerScorer.cpp4
-rw-r--r--mert/PerScorer.h2
-rw-r--r--mert/PermutationScorer.cpp2
-rw-r--r--mert/PermutationScorer.h2
-rw-r--r--mert/ScoreData.h3
-rw-r--r--mert/Scorer.h18
-rw-r--r--mert/ScorerFactory.cpp4
-rw-r--r--mert/SemposOverlapping.cpp16
-rw-r--r--mert/SemposOverlapping.h14
-rw-r--r--mert/SemposScorer.h2
-rw-r--r--mert/StatisticsBasedScorer.cpp2
-rw-r--r--mert/StatisticsBasedScorer.h11
-rw-r--r--mert/TER/alignmentStruct.cpp34
-rw-r--r--mert/TER/alignmentStruct.h53
-rw-r--r--mert/TER/bestShiftStruct.h48
-rw-r--r--mert/TER/hashMap.cpp250
-rw-r--r--mert/TER/hashMap.h69
-rw-r--r--mert/TER/hashMapInfos.cpp249
-rw-r--r--mert/TER/hashMapInfos.h69
-rw-r--r--mert/TER/hashMapStringInfos.cpp322
-rw-r--r--mert/TER/hashMapStringInfos.h69
-rw-r--r--mert/TER/infosHasher.cpp71
-rw-r--r--mert/TER/infosHasher.h57
-rw-r--r--mert/TER/stringHasher.cpp64
-rw-r--r--mert/TER/stringHasher.h58
-rw-r--r--mert/TER/stringInfosHasher.cpp71
-rw-r--r--mert/TER/stringInfosHasher.h60
-rw-r--r--mert/TER/terAlignment.cpp285
-rw-r--r--mert/TER/terAlignment.h83
-rw-r--r--mert/TER/terShift.cpp134
-rw-r--r--mert/TER/terShift.h75
-rw-r--r--mert/TER/tercalc.cpp1823
-rw-r--r--mert/TER/tercalc.h121
-rw-r--r--mert/TER/tools.cpp1085
-rw-r--r--mert/TER/tools.h117
-rw-r--r--mert/TerScorer.cpp2
-rw-r--r--mert/TerScorer.h2
-rw-r--r--mert/Types.h2
-rw-r--r--mert/evaluator.cpp61
-rw-r--r--mert/kbmira.cpp210
-rw-r--r--mira/Decoder.cpp6
-rw-r--r--mira/Jamfile2
-rw-r--r--mira/Main.cpp2
-rw-r--r--misc/CreateProbingPT.cpp20
-rw-r--r--misc/Jamfile43
-rw-r--r--misc/QueryProbingPT.cpp61
-rw-r--r--misc/merge-sorted.cc (renamed from contrib/m4m/util/merge-sorted.cc)0
-rw-r--r--misc/prunePhraseTable.cpp227
-rw-r--r--misc/queryPhraseTableMin.cpp6
-rw-r--r--moses-chart-cmd/IOWrapper.cpp1013
-rw-r--r--moses-chart-cmd/IOWrapper.h150
-rw-r--r--moses-chart-cmd/Jamfile2
-rw-r--r--moses-chart-cmd/Main.cpp365
-rw-r--r--moses-chart-cmd/Main.h45
-rw-r--r--moses-chart-cmd/TranslationAnalysis.cpp52
-rw-r--r--moses-chart-cmd/TranslationAnalysis.h24
-rw-r--r--moses-chart-cmd/mbr.cpp172
-rw-r--r--moses-cmd/IOWrapper.cpp679
-rw-r--r--moses-cmd/IOWrapper.h166
-rw-r--r--moses-cmd/Jamfile4
-rw-r--r--moses-cmd/LatticeMBRGrid.cpp20
-rw-r--r--moses-cmd/Main.cpp705
-rw-r--r--moses-cmd/Main.h5
-rw-r--r--moses/AlignmentInfo.cpp16
-rw-r--r--moses/AlignmentInfo.h11
-rw-r--r--moses/AlignmentInfoCollection.cpp12
-rw-r--r--moses/AlignmentInfoCollection.h12
-rw-r--r--moses/BaseManager.cpp111
-rw-r--r--moses/BaseManager.h75
-rw-r--r--moses/BitmapContainer.cpp50
-rw-r--r--moses/ChartCell.cpp9
-rw-r--r--moses/ChartCell.h3
-rw-r--r--moses/ChartCellCollection.h11
-rw-r--r--moses/ChartCellLabel.h1
-rw-r--r--moses/ChartCellLabelSet.h11
-rw-r--r--moses/ChartHypothesis.cpp42
-rw-r--r--moses/ChartHypothesis.h6
-rw-r--r--moses/ChartHypothesisCollection.cpp25
-rw-r--r--moses/ChartHypothesisCollection.h4
-rw-r--r--moses/ChartKBestExtractor.cpp29
-rw-r--r--moses/ChartKBestExtractor.h2
-rw-r--r--moses/ChartManager.cpp556
-rw-r--r--moses/ChartManager.h84
-rw-r--r--moses/ChartParser.cpp17
-rw-r--r--moses/ChartParserCallback.h2
-rw-r--r--moses/ChartTranslationOption.cpp8
-rw-r--r--moses/ChartTranslationOption.h6
-rw-r--r--moses/ChartTranslationOptionList.cpp4
-rw-r--r--moses/ChartTranslationOptionList.h2
-rw-r--r--moses/ChartTranslationOptions.cpp21
-rw-r--r--moses/ChartTranslationOptions.h2
-rw-r--r--moses/ConfusionNet.cpp508
-rw-r--r--moses/DecodeGraph.h14
-rw-r--r--moses/DecodeStepGeneration.cpp2
-rw-r--r--moses/DecodeStepTranslation.cpp17
-rw-r--r--moses/FF/BleuScoreFeature.cpp4
-rw-r--r--moses/FF/BleuScoreFeature.h8
-rw-r--r--moses/FF/ConstrainedDecoding.cpp4
-rw-r--r--moses/FF/ConstrainedDecoding.h12
-rw-r--r--moses/FF/ControlRecombination.cpp4
-rw-r--r--moses/FF/ControlRecombination.h8
-rw-r--r--moses/FF/CountNonTerms.cpp62
-rw-r--r--moses/FF/CountNonTerms.h8
-rw-r--r--moses/FF/CoveredReferenceFeature.cpp49
-rw-r--r--moses/FF/CoveredReferenceFeature.h8
-rw-r--r--moses/FF/DecodeFeature.cpp2
-rw-r--r--moses/FF/DecodeFeature.h11
-rw-r--r--moses/FF/DistortionScoreProducer.cpp2
-rw-r--r--moses/FF/DistortionScoreProducer.h8
-rw-r--r--moses/FF/DynamicCacheBasedLanguageModel.cpp2
-rw-r--r--moses/FF/DynamicCacheBasedLanguageModel.h27
-rw-r--r--moses/FF/ExternalFeature.cpp4
-rw-r--r--moses/FF/ExternalFeature.h9
-rw-r--r--moses/FF/Factory.cpp91
-rw-r--r--moses/FF/FeatureFunction.cpp12
-rw-r--r--moses/FF/FeatureFunction.h17
-rw-r--r--moses/FF/GlobalLexicalModel.cpp10
-rw-r--r--moses/FF/GlobalLexicalModel.h26
-rw-r--r--moses/FF/GlobalLexicalModelUnlimited.cpp2
-rw-r--r--moses/FF/GlobalLexicalModelUnlimited.h8
-rw-r--r--moses/FF/HyperParameterAsWeight.h8
-rw-r--r--moses/FF/InputFeature.cpp2
-rw-r--r--moses/FF/InputFeature.h9
-rw-r--r--moses/FF/InternalStructStatelessFF.cpp42
-rw-r--r--moses/FF/InternalTree.cpp230
-rw-r--r--moses/FF/InternalTree.h177
-rw-r--r--moses/FF/LexicalReordering/LexicalReordering.cpp37
-rw-r--r--moses/FF/LexicalReordering/LexicalReordering.h23
-rw-r--r--moses/FF/LexicalReordering/LexicalReorderingState.cpp140
-rw-r--r--moses/FF/LexicalReordering/LexicalReorderingState.h42
-rw-r--r--moses/FF/LexicalReordering/ReorderingStack.cpp (renamed from moses/ReorderingStack.cpp)0
-rw-r--r--moses/FF/LexicalReordering/ReorderingStack.h (renamed from moses/ReorderingStack.h)2
-rw-r--r--moses/FF/LexicalReordering/SparseReordering.cpp254
-rw-r--r--moses/FF/LexicalReordering/SparseReordering.h133
-rw-r--r--moses/FF/MaxSpanFreeNonTermSource.cpp54
-rw-r--r--moses/FF/MaxSpanFreeNonTermSource.h55
-rw-r--r--moses/FF/NieceTerminal.cpp94
-rw-r--r--moses/FF/NieceTerminal.h9
-rw-r--r--moses/FF/OSM-Feature/KenOSM.cpp32
-rw-r--r--moses/FF/OSM-Feature/KenOSM.h50
-rw-r--r--moses/FF/OSM-Feature/OpSequenceModel.cpp19
-rw-r--r--moses/FF/OSM-Feature/OpSequenceModel.h14
-rw-r--r--moses/FF/OSM-Feature/osmHyp.cpp4
-rw-r--r--moses/FF/OSM-Feature/osmHyp.h5
-rw-r--r--moses/FF/PhraseBoundaryFeature.cpp2
-rw-r--r--moses/FF/PhraseBoundaryFeature.h8
-rw-r--r--moses/FF/PhraseLengthFeature.cpp2
-rw-r--r--moses/FF/PhraseLengthFeature.h13
-rw-r--r--moses/FF/PhraseOrientationFeature.cpp617
-rw-r--r--moses/FF/PhraseOrientationFeature.h211
-rw-r--r--moses/FF/PhrasePairFeature.cpp46
-rw-r--r--moses/FF/PhrasePairFeature.h31
-rw-r--r--moses/FF/PhrasePenalty.cpp37
-rw-r--r--moses/FF/PhrasePenalty.h16
-rw-r--r--moses/FF/ReferenceComparison.h60
-rw-r--r--moses/FF/RuleScope.cpp37
-rw-r--r--moses/FF/RuleScope.h55
-rw-r--r--moses/FF/SetSourcePhrase.cpp12
-rw-r--r--moses/FF/SetSourcePhrase.h38
-rw-r--r--moses/FF/SkeletonChangeInput.cpp92
-rw-r--r--moses/FF/SkeletonChangeInput.h41
-rw-r--r--moses/FF/SkeletonStatefulFF.cpp23
-rw-r--r--moses/FF/SkeletonStatefulFF.h14
-rw-r--r--moses/FF/SkeletonStatelessFF.cpp35
-rw-r--r--moses/FF/SkeletonStatelessFF.h14
-rw-r--r--moses/FF/SoftMatchingFeature.cpp69
-rw-r--r--moses/FF/SoftMatchingFeature.h8
-rw-r--r--moses/FF/SoftSourceSyntacticConstraintsFeature.cpp564
-rw-r--r--moses/FF/SoftSourceSyntacticConstraintsFeature.h89
-rw-r--r--moses/FF/SourceGHKMTreeInputMatchFeature.cpp67
-rw-r--r--moses/FF/SourceGHKMTreeInputMatchFeature.h42
-rw-r--r--moses/FF/SourceWordDeletionFeature.cpp7
-rw-r--r--moses/FF/SourceWordDeletionFeature.h8
-rw-r--r--moses/FF/SpanLength.cpp93
-rw-r--r--moses/FF/SpanLength.h52
-rw-r--r--moses/FF/SparseHieroReorderingFeature.cpp222
-rw-r--r--moses/FF/SparseHieroReorderingFeature.h80
-rw-r--r--moses/FF/SparseHieroReorderingFeatureTest.cpp (renamed from moses-chart-cmd/mbr.h)27
-rw-r--r--moses/FF/StatefulFeatureFunction.h11
-rw-r--r--moses/FF/StatelessFeatureFunction.h10
-rw-r--r--moses/FF/SyntaxRHS.cpp46
-rw-r--r--moses/FF/SyntaxRHS.h (renamed from moses/FF/InternalStructStatelessFF.h)19
-rw-r--r--moses/FF/TargetBigramFeature.cpp2
-rw-r--r--moses/FF/TargetBigramFeature.h8
-rw-r--r--moses/FF/TargetNgramFeature.cpp4
-rw-r--r--moses/FF/TargetNgramFeature.h8
-rw-r--r--moses/FF/TargetWordInsertionFeature.cpp8
-rw-r--r--moses/FF/TargetWordInsertionFeature.h8
-rw-r--r--moses/FF/TreeStructureFeature.cpp291
-rw-r--r--moses/FF/TreeStructureFeature.h175
-rw-r--r--moses/FF/UnknownWordPenaltyProducer.h11
-rw-r--r--moses/FF/WordPenaltyProducer.cpp2
-rw-r--r--moses/FF/WordPenaltyProducer.h11
-rw-r--r--moses/FF/WordTranslationFeature.cpp46
-rw-r--r--moses/FF/WordTranslationFeature.h21
-rw-r--r--moses/FactorCollection.cpp17
-rw-r--r--moses/FactorCollection.h2
-rw-r--r--moses/FeatureVector.cpp4
-rw-r--r--moses/FeatureVector.h2
-rw-r--r--moses/HypergraphOutput.cpp252
-rw-r--r--moses/HypergraphOutput.h95
-rw-r--r--moses/Hypothesis.cpp42
-rw-r--r--moses/Hypothesis.h22
-rw-r--r--moses/IOWrapper.cpp909
-rw-r--r--moses/IOWrapper.h218
-rw-r--r--moses/Incremental.cpp197
-rw-r--r--moses/Incremental.h54
-rw-r--r--moses/InputPath.cpp5
-rw-r--r--moses/Jamfile25
-rw-r--r--moses/LM/Base.cpp2
-rw-r--r--moses/LM/Base.h4
-rw-r--r--moses/LM/BilingualLM.cpp470
-rw-r--r--moses/LM/BilingualLM.h142
-rw-r--r--moses/LM/DALMWrapper.cpp746
-rw-r--r--moses/LM/DALMWrapper.h4
-rw-r--r--moses/LM/IRST.cpp1
-rw-r--r--moses/LM/Implementation.cpp14
-rw-r--r--moses/LM/Implementation.h4
-rw-r--r--moses/LM/Jamfile32
-rw-r--r--moses/LM/Ken.cpp60
-rw-r--r--moses/LM/Ken.h6
-rw-r--r--moses/LM/LDHT.cpp4
-rw-r--r--moses/LM/NeuralLMWrapper.cpp64
-rw-r--r--moses/LM/NeuralLMWrapper.h16
-rw-r--r--moses/LM/SingleFactor.cpp11
-rw-r--r--moses/LM/SingleFactor.h2
-rw-r--r--moses/LM/bilingual-lm/BiLM_NPLM.cpp138
-rw-r--r--moses/LM/bilingual-lm/BiLM_NPLM.h49
-rw-r--r--moses/LM/oxlm/OxLM.cpp209
-rw-r--r--moses/LM/oxlm/OxLM.h60
-rw-r--r--moses/LM/oxlm/OxLMMapper.cpp47
-rw-r--r--moses/LM/oxlm/OxLMMapper.h35
-rw-r--r--moses/LM/oxlm/OxLMParallelMapper.cpp40
-rw-r--r--moses/LM/oxlm/OxLMParallelMapper.h21
-rw-r--r--moses/LM/oxlm/SourceOxLM.cpp137
-rw-r--r--moses/LM/oxlm/SourceOxLM.h49
-rw-r--r--moses/LatticeMBR.cpp (renamed from moses-cmd/LatticeMBR.cpp)3
-rw-r--r--moses/LatticeMBR.h (renamed from moses-cmd/LatticeMBR.h)2
-rw-r--r--moses/Manager.cpp492
-rw-r--r--moses/Manager.h44
-rw-r--r--moses/MockHypothesis.cpp4
-rw-r--r--moses/PDTAimp.cpp462
-rw-r--r--moses/PDTAimp.h468
-rw-r--r--moses/PP/CountsPhraseProperty.cpp38
-rw-r--r--moses/PP/CountsPhraseProperty.h62
-rw-r--r--moses/PP/Factory.cpp15
-rw-r--r--moses/PP/NonTermContextProperty.cpp137
-rw-r--r--moses/PP/NonTermContextProperty.h73
-rw-r--r--moses/PP/OrientationPhraseProperty.cpp26
-rw-r--r--moses/PP/OrientationPhraseProperty.h73
-rw-r--r--moses/PP/PhraseProperty.cpp13
-rw-r--r--moses/PP/PhraseProperty.h13
-rw-r--r--moses/PP/SourceLabelsPhraseProperty.cpp124
-rw-r--r--moses/PP/SourceLabelsPhraseProperty.h77
-rw-r--r--moses/PP/SpanLengthPhraseProperty.cpp127
-rw-r--r--moses/PP/SpanLengthPhraseProperty.h35
-rw-r--r--moses/PP/TreeStructurePhraseProperty.h2
-rw-r--r--moses/Parameter.cpp210
-rw-r--r--moses/Parameter.h39
-rw-r--r--moses/Phrase.h4
-rw-r--r--moses/PrefixTree.h3
-rw-r--r--moses/RuleCubeItem.cpp2
-rw-r--r--moses/ScoreComponentCollection.cpp16
-rw-r--r--moses/ScoreComponentCollection.h24
-rw-r--r--moses/ScoreComponentCollectionTest.cpp8
-rw-r--r--moses/SearchCubePruning.cpp7
-rw-r--r--moses/SearchNormal.cpp20
-rw-r--r--moses/SearchNormal.h2
-rw-r--r--moses/SearchNormalBatch.cpp6
-rw-r--r--moses/StaticData.cpp518
-rw-r--r--moses/StaticData.h41
-rw-r--r--moses/SyntacticLanguageModel.h2
-rw-r--r--moses/Syntax/BoundedPriorityContainer.h164
-rw-r--r--moses/Syntax/Cube.cpp138
-rw-r--r--moses/Syntax/Cube.h58
-rw-r--r--moses/Syntax/CubeQueue.cpp37
-rw-r--r--moses/Syntax/CubeQueue.h50
-rw-r--r--moses/Syntax/KBestExtractor.cpp317
-rw-r--r--moses/Syntax/KBestExtractor.h118
-rw-r--r--moses/Syntax/NonTerminalMap.h71
-rw-r--r--moses/Syntax/PHyperedge.h22
-rw-r--r--moses/Syntax/PVertex.h21
-rw-r--r--moses/Syntax/RuleTable.h24
-rw-r--r--moses/Syntax/RuleTableFF.cpp51
-rw-r--r--moses/Syntax/RuleTableFF.h50
-rw-r--r--moses/Syntax/S2T/DerivationWriter.cpp100
-rw-r--r--moses/Syntax/S2T/DerivationWriter.h38
-rw-r--r--moses/Syntax/S2T/Manager-inl.h599
-rw-r--r--moses/Syntax/S2T/Manager.h97
-rw-r--r--moses/Syntax/S2T/OovHandler-inl.h107
-rw-r--r--moses/Syntax/S2T/OovHandler.h49
-rw-r--r--moses/Syntax/S2T/PChart.cpp34
-rw-r--r--moses/Syntax/S2T/PChart.h89
-rw-r--r--moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h44
-rw-r--r--moses/Syntax/S2T/ParserCallback.h83
-rw-r--r--moses/Syntax/S2T/Parsers/Parser.h30
-rw-r--r--moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h164
-rw-r--r--moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h61
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h185
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h71
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp190
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h65
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h27
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h21
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp160
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h41
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h32
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp131
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h41
-rw-r--r--moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h95
-rw-r--r--moses/Syntax/S2T/RuleTrie.h40
-rw-r--r--moses/Syntax/S2T/RuleTrieCYKPlus.cpp151
-rw-r--r--moses/Syntax/S2T/RuleTrieCYKPlus.h89
-rw-r--r--moses/Syntax/S2T/RuleTrieCreator.h33
-rw-r--r--moses/Syntax/S2T/RuleTrieLoader.cpp156
-rw-r--r--moses/Syntax/S2T/RuleTrieLoader.h31
-rw-r--r--moses/Syntax/S2T/RuleTrieScope3.cpp153
-rw-r--r--moses/Syntax/S2T/RuleTrieScope3.h106
-rw-r--r--moses/Syntax/S2T/SChart.cpp20
-rw-r--r--moses/Syntax/S2T/SChart.h50
-rw-r--r--moses/Syntax/SHyperedge.cpp59
-rw-r--r--moses/Syntax/SHyperedge.h28
-rw-r--r--moses/Syntax/SHyperedgeBundle.h30
-rw-r--r--moses/Syntax/SHyperedgeBundleScorer.h28
-rw-r--r--moses/Syntax/SVertex.cpp28
-rw-r--r--moses/Syntax/SVertex.h31
-rw-r--r--moses/Syntax/SVertexRecombinationOrderer.h38
-rw-r--r--moses/Syntax/SVertexStack.h28
-rw-r--r--moses/Syntax/SymbolEqualityPred.h24
-rw-r--r--moses/Syntax/SymbolHasher.h25
-rw-r--r--moses/TargetPhrase.cpp67
-rw-r--r--moses/TargetPhrase.h45
-rw-r--r--moses/TargetPhraseCollection.h6
-rw-r--r--moses/ThreadPool.cpp5
-rw-r--r--moses/Timer.cpp11
-rw-r--r--moses/TranslationAnalysis.cpp (renamed from moses-cmd/TranslationAnalysis.cpp)30
-rw-r--r--moses/TranslationAnalysis.h (renamed from moses-cmd/TranslationAnalysis.h)12
-rw-r--r--moses/TranslationModel/BilingualDynSuffixArray.cpp10
-rw-r--r--moses/TranslationModel/BilingualDynSuffixArray.h2
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h13
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp273
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h17
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp273
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h15
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp23
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h4
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp2
-rw-r--r--moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.cpp43
-rw-r--r--moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h6
-rw-r--r--moses/TranslationModel/CompactPT/MurmurHash3.cpp2
-rw-r--r--moses/TranslationModel/CompactPT/PhraseDecoder.cpp2
-rw-r--r--moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp2
-rw-r--r--moses/TranslationModel/CompactPT/PhraseTableCreator.cpp68
-rw-r--r--moses/TranslationModel/CompactPT/StringVector.h21
-rw-r--r--moses/TranslationModel/DynSAInclude/FileHandler.cpp8
-rw-r--r--moses/TranslationModel/DynSAInclude/params.cpp3
-rw-r--r--moses/TranslationModel/PhraseDictionary.cpp92
-rw-r--r--moses/TranslationModel/PhraseDictionary.h5
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp4
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp13
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h2
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModel.cpp140
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModel.h9
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp25
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModelCounts.h2
-rw-r--r--moses/TranslationModel/PhraseDictionaryTransliteration.cpp169
-rw-r--r--moses/TranslationModel/PhraseDictionaryTree.cpp33
-rw-r--r--moses/TranslationModel/ProbingPT/Jamfile13
-rw-r--r--moses/TranslationModel/ProbingPT/ProbingPT.cpp231
-rw-r--r--moses/TranslationModel/ProbingPT/ProbingPT.h59
-rw-r--r--moses/TranslationModel/ProbingPT/hash.cpp27
-rw-r--r--moses/TranslationModel/ProbingPT/hash.hh14
-rw-r--r--moses/TranslationModel/ProbingPT/huffmanish.cpp414
-rw-r--r--moses/TranslationModel/ProbingPT/huffmanish.hh110
-rw-r--r--moses/TranslationModel/ProbingPT/line_splitter.cpp52
-rw-r--r--moses/TranslationModel/ProbingPT/line_splitter.hh31
-rw-r--r--moses/TranslationModel/ProbingPT/probing_hash_utils.cpp32
-rw-r--r--moses/TranslationModel/ProbingPT/probing_hash_utils.hh37
-rw-r--r--moses/TranslationModel/ProbingPT/quering.cpp174
-rw-r--r--moses/TranslationModel/ProbingPT/quering.hh39
-rw-r--r--moses/TranslationModel/ProbingPT/storing.cpp151
-rw-r--r--moses/TranslationModel/ProbingPT/storing.hh33
-rw-r--r--moses/TranslationModel/ProbingPT/tests/tokenization_tests.cpp198
-rw-r--r--moses/TranslationModel/ProbingPT/tests/vocabid_test.cpp45
-rw-r--r--moses/TranslationModel/ProbingPT/vocabid.cpp29
-rw-r--r--moses/TranslationModel/ProbingPT/vocabid.hh20
-rw-r--r--moses/TranslationModel/RuleTable/LoaderCompact.cpp4
-rw-r--r--moses/TranslationModel/RuleTable/LoaderStandard.cpp9
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp4
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp7
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp18
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h9
-rw-r--r--moses/TranslationModel/SkeletonPT.cpp4
-rw-r--r--moses/TranslationModel/UG/Jamfile101
-rw-r--r--moses/TranslationModel/UG/count-ptable-features.cc26
-rw-r--r--moses/TranslationModel/UG/generic/Jamfile3
-rw-r--r--moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc50
-rw-r--r--moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h18
-rw-r--r--moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc434
-rw-r--r--moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h87
-rw-r--r--moses/TranslationModel/UG/mm/Jamfile30
-rw-r--r--moses/TranslationModel/UG/mm/custom-pt.cc17
-rw-r--r--moses/TranslationModel/UG/mm/mmlex-lookup.cc2
-rw-r--r--moses/TranslationModel/UG/mm/mtt-build.cc2
-rw-r--r--moses/TranslationModel/UG/mm/mtt-demo1.cc54
-rw-r--r--moses/TranslationModel/UG/mm/num_read_write.cc74
-rw-r--r--moses/TranslationModel/UG/mm/num_read_write.h124
-rw-r--r--moses/TranslationModel/UG/mm/tpt_pickler.cc23
-rw-r--r--moses/TranslationModel/UG/mm/tpt_pickler.h19
-rw-r--r--moses/TranslationModel/UG/mm/tpt_tightindex.h11
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext.cc203
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext.h911
-rw-r--r--moses/TranslationModel/UG/mm/ug_im_tsa.h16
-rw-r--r--moses/TranslationModel/UG/mm/ug_im_ttrack.h44
-rw-r--r--moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h71
-rw-r--r--moses/TranslationModel/UG/mm/ug_lru_cache.h106
-rw-r--r--moses/TranslationModel/UG/mm/ug_mm_2d_table.h23
-rw-r--r--moses/TranslationModel/UG/mm/ug_mm_tsa.h10
-rw-r--r--moses/TranslationModel/UG/mm/ug_mm_ttrack.h2
-rw-r--r--moses/TranslationModel/UG/mm/ug_phrasepair.cc97
-rw-r--r--moses/TranslationModel/UG/mm/ug_phrasepair.h13
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_base.h40
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h2
-rw-r--r--moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h76
-rw-r--r--moses/TranslationModel/UG/mmsapt.cpp899
-rw-r--r--moses/TranslationModel/UG/mmsapt.h148
-rw-r--r--moses/TranslationModel/UG/mmsapt_align.cc608
-rw-r--r--moses/TranslationModel/UG/ptable-describe-features.cc37
-rw-r--r--moses/TranslationModel/UG/ptable-lookup.cc123
-rw-r--r--moses/TranslationModel/UG/sapt_phrase_key.h13
-rw-r--r--moses/TranslationModel/UG/sapt_phrase_scorers.h14
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_base.h103
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_coherence.h33
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_lex1.h70
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_logcnt.h65
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_pbwd.h58
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_pfwd.h70
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_phrasecount.h34
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_provenance.h47
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_rareness.h41
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_unaligned.h67
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_wordcount.h34
-rw-r--r--moses/TranslationModel/UG/sim-pe.cc84
-rw-r--r--moses/TranslationModel/UG/spe-check-coverage.cc214
-rw-r--r--moses/TranslationModel/UG/spe-check-coverage2.cc76
-rw-r--r--moses/TranslationModel/UG/spe-check-coverage3.cc194
-rw-r--r--moses/TranslationModel/UG/try-align.cc521
-rw-r--r--moses/TranslationModel/UG/try-align2.cc886
-rw-r--r--moses/TranslationModel/UG/util/Makefile7
-rwxr-xr-xmoses/TranslationModel/UG/util/ibm1-alignbin0 -> 1062799 bytes
-rw-r--r--moses/TranslationModel/UG/util/ibm1-align.cc164
-rw-r--r--moses/TranslationModel/UG/util/tokenindex.dump.cc31
-rw-r--r--moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp39
-rw-r--r--moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h4
-rw-r--r--moses/TranslationModel/fuzzy-match/SuffixArray.cpp15
-rw-r--r--moses/TranslationModel/fuzzy-match/Vocabulary.h14
-rw-r--r--moses/TranslationModel/fuzzy-match/create_xml.cpp36
-rw-r--r--moses/TranslationOption.cpp6
-rw-r--r--moses/TranslationOption.h2
-rw-r--r--moses/TranslationOptionCollection.cpp16
-rw-r--r--moses/TranslationOptionCollection.h2
-rw-r--r--moses/TranslationOptionCollectionLattice.cpp110
-rw-r--r--moses/TranslationTask.cpp318
-rw-r--r--moses/TranslationTask.h69
-rw-r--r--moses/TreeInput.cpp24
-rw-r--r--moses/TypeDef.h34
-rw-r--r--moses/Util.cpp41
-rw-r--r--moses/Util.h69
-rw-r--r--moses/Word.cpp99
-rw-r--r--moses/WordLattice.cpp2
-rw-r--r--moses/XmlOption.cpp102
-rw-r--r--moses/mbr.cpp (renamed from moses-cmd/mbr.cpp)0
-rw-r--r--moses/mbr.h (renamed from moses-cmd/mbr.h)0
-rw-r--r--phrase-extract/DomainFeature.cpp12
-rw-r--r--phrase-extract/ExtractionPhrasePair.cpp234
-rw-r--r--phrase-extract/ExtractionPhrasePair.h13
-rw-r--r--phrase-extract/Jamfile2
-rw-r--r--phrase-extract/PhraseExtractionOptions.h1
-rw-r--r--phrase-extract/PropertiesConsolidator.cpp159
-rw-r--r--phrase-extract/PropertiesConsolidator.h (renamed from contrib/other-builds/extract-mixed-syntax/XmlTree.h)83
-rw-r--r--phrase-extract/SafeGetline.h35
-rw-r--r--phrase-extract/ScoreFeature.h2
-rw-r--r--phrase-extract/ScoreFeatureTest.cpp20
-rw-r--r--phrase-extract/SentenceAlignment.cpp6
-rw-r--r--phrase-extract/SentenceAlignment.h7
-rw-r--r--phrase-extract/SyntaxTree.h7
-rw-r--r--phrase-extract/XmlTree.cpp9
-rw-r--r--phrase-extract/consolidate-direct-main.cpp24
-rw-r--r--phrase-extract/consolidate-main.cpp61
-rw-r--r--phrase-extract/consolidate-reverse-main.cpp23
-rw-r--r--phrase-extract/extract-ghkm/Alignment.cpp8
-rw-r--r--phrase-extract/extract-ghkm/Alignment.h5
-rw-r--r--phrase-extract/extract-ghkm/ExtractGHKM.cpp384
-rw-r--r--phrase-extract/extract-ghkm/ExtractGHKM.h10
-rw-r--r--phrase-extract/extract-ghkm/Options.h24
-rw-r--r--phrase-extract/extract-ghkm/ParseTree.h4
-rw-r--r--phrase-extract/extract-ghkm/PhraseOrientation.cpp433
-rw-r--r--phrase-extract/extract-ghkm/PhraseOrientation.h106
-rw-r--r--phrase-extract/extract-ghkm/Rule.cpp41
-rw-r--r--phrase-extract/extract-ghkm/Rule.h58
-rw-r--r--phrase-extract/extract-ghkm/ScfgRule.cpp97
-rw-r--r--phrase-extract/extract-ghkm/ScfgRule.h54
-rw-r--r--phrase-extract/extract-ghkm/ScfgRuleWriter.cpp37
-rw-r--r--phrase-extract/extract-ghkm/ScfgRuleWriter.h4
-rw-r--r--phrase-extract/extract-ghkm/StsgRule.cpp95
-rw-r--r--phrase-extract/extract-ghkm/StsgRule.h44
-rw-r--r--phrase-extract/extract-ghkm/StsgRuleWriter.cpp95
-rw-r--r--phrase-extract/extract-ghkm/StsgRuleWriter.h41
-rw-r--r--phrase-extract/extract-ghkm/Subgraph.h36
-rw-r--r--phrase-extract/extract-ghkm/XmlTreeParser.h8
-rw-r--r--phrase-extract/extract-main.cpp34
-rw-r--r--phrase-extract/extract-mixed-syntax/AlignedSentence.cpp194
-rw-r--r--phrase-extract/extract-mixed-syntax/AlignedSentence.h51
-rw-r--r--phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp183
-rw-r--r--phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h46
-rw-r--r--phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp66
-rw-r--r--phrase-extract/extract-mixed-syntax/ConsistentPhrase.h51
-rw-r--r--phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp103
-rw-r--r--phrase-extract/extract-mixed-syntax/ConsistentPhrases.h40
-rw-r--r--phrase-extract/extract-mixed-syntax/InputFileStream.cpp (renamed from contrib/other-builds/extract-mixed-syntax/InputFileStream.cpp)0
-rw-r--r--phrase-extract/extract-mixed-syntax/InputFileStream.h (renamed from contrib/other-builds/extract-mixed-syntax/InputFileStream.h)0
-rw-r--r--phrase-extract/extract-mixed-syntax/Jamfile2
-rw-r--r--phrase-extract/extract-mixed-syntax/Main.cpp208
-rw-r--r--phrase-extract/extract-mixed-syntax/Main.h12
-rw-r--r--phrase-extract/extract-mixed-syntax/NonTerm.cpp66
-rw-r--r--phrase-extract/extract-mixed-syntax/NonTerm.h47
-rw-r--r--phrase-extract/extract-mixed-syntax/Parameter.cpp72
-rw-r--r--phrase-extract/extract-mixed-syntax/Parameter.h65
-rw-r--r--phrase-extract/extract-mixed-syntax/Phrase.cpp14
-rw-r--r--phrase-extract/extract-mixed-syntax/Phrase.h19
-rw-r--r--phrase-extract/extract-mixed-syntax/Rule.cpp662
-rw-r--r--phrase-extract/extract-mixed-syntax/Rule.h90
-rw-r--r--phrase-extract/extract-mixed-syntax/RulePhrase.cpp50
-rw-r--r--phrase-extract/extract-mixed-syntax/RulePhrase.h49
-rw-r--r--phrase-extract/extract-mixed-syntax/RuleSymbol.cpp36
-rw-r--r--phrase-extract/extract-mixed-syntax/RuleSymbol.h31
-rw-r--r--phrase-extract/extract-mixed-syntax/Rules.cpp227
-rw-r--r--phrase-extract/extract-mixed-syntax/Rules.h72
-rw-r--r--phrase-extract/extract-mixed-syntax/SyntaxTree.cpp47
-rw-r--r--phrase-extract/extract-mixed-syntax/SyntaxTree.h32
-rw-r--r--phrase-extract/extract-mixed-syntax/Word.cpp68
-rw-r--r--phrase-extract/extract-mixed-syntax/Word.h49
-rw-r--r--phrase-extract/extract-mixed-syntax/gzfilebuf.h (renamed from contrib/other-builds/extract-mixed-syntax/gzfilebuf.h)0
-rw-r--r--phrase-extract/extract-mixed-syntax/pugiconfig.hpp69
-rw-r--r--phrase-extract/extract-mixed-syntax/pugixml.cpp10250
-rw-r--r--phrase-extract/extract-mixed-syntax/pugixml.hpp1265
-rw-r--r--phrase-extract/extract-ordering-main.cpp687
-rw-r--r--phrase-extract/extract-rules-main.cpp34
-rw-r--r--phrase-extract/filter-rule-table/FilterRuleTable.cpp161
-rw-r--r--phrase-extract/filter-rule-table/FilterRuleTable.h48
-rw-r--r--phrase-extract/filter-rule-table/Jamfile1
-rw-r--r--phrase-extract/filter-rule-table/Main.cpp7
-rw-r--r--phrase-extract/filter-rule-table/Options.h22
-rw-r--r--phrase-extract/filter-rule-table/StringBasedFilter.cpp27
-rw-r--r--phrase-extract/filter-rule-table/StringBasedFilter.h24
-rw-r--r--phrase-extract/filter-rule-table/TreeBasedFilter.cpp243
-rw-r--r--phrase-extract/filter-rule-table/TreeBasedFilter.h87
-rw-r--r--phrase-extract/pcfg-common/Jamfile2
-rw-r--r--phrase-extract/pcfg-common/exception.h46
-rw-r--r--phrase-extract/pcfg-common/numbered_set.h126
-rw-r--r--phrase-extract/pcfg-common/pcfg.cc10
-rw-r--r--phrase-extract/pcfg-common/pcfg.h37
-rw-r--r--phrase-extract/pcfg-common/pcfg_tree.h39
-rw-r--r--phrase-extract/pcfg-common/syntax_tree.h65
-rw-r--r--phrase-extract/pcfg-common/tool.cc6
-rw-r--r--phrase-extract/pcfg-common/tool.h27
-rw-r--r--phrase-extract/pcfg-common/typedef.h16
-rw-r--r--phrase-extract/pcfg-common/xml_tree_parser.cc24
-rw-r--r--phrase-extract/pcfg-common/xml_tree_parser.h23
-rw-r--r--phrase-extract/pcfg-common/xml_tree_writer.h36
-rw-r--r--phrase-extract/pcfg-extract/main.cc2
-rw-r--r--phrase-extract/pcfg-extract/options.h10
-rw-r--r--phrase-extract/pcfg-extract/pcfg_extract.cc9
-rw-r--r--phrase-extract/pcfg-extract/pcfg_extract.h15
-rw-r--r--phrase-extract/pcfg-extract/rule_collection.cc6
-rw-r--r--phrase-extract/pcfg-extract/rule_collection.h37
-rw-r--r--phrase-extract/pcfg-extract/rule_extractor.cc6
-rw-r--r--phrase-extract/pcfg-extract/rule_extractor.h21
-rw-r--r--phrase-extract/pcfg-score/main.cc2
-rw-r--r--phrase-extract/pcfg-score/options.h10
-rw-r--r--phrase-extract/pcfg-score/pcfg_score.cc30
-rw-r--r--phrase-extract/pcfg-score/pcfg_score.h19
-rw-r--r--phrase-extract/pcfg-score/tree_scorer.cc6
-rw-r--r--phrase-extract/pcfg-score/tree_scorer.h19
-rw-r--r--phrase-extract/relax-parse-main.cpp14
-rw-r--r--phrase-extract/score-main.cpp495
-rw-r--r--phrase-extract/score-stsg/Jamfile1
-rw-r--r--phrase-extract/score-stsg/LexicalTable.cpp56
-rw-r--r--phrase-extract/score-stsg/LexicalTable.h46
-rw-r--r--phrase-extract/score-stsg/Main.cpp7
-rw-r--r--phrase-extract/score-stsg/Options.h44
-rw-r--r--phrase-extract/score-stsg/RuleGroup.cpp45
-rw-r--r--phrase-extract/score-stsg/RuleGroup.h71
-rw-r--r--phrase-extract/score-stsg/RuleSymbol.h20
-rw-r--r--phrase-extract/score-stsg/RuleTableWriter.cpp80
-rw-r--r--phrase-extract/score-stsg/RuleTableWriter.h44
-rw-r--r--phrase-extract/score-stsg/ScoreStsg.cpp445
-rw-r--r--phrase-extract/score-stsg/ScoreStsg.h76
-rw-r--r--phrase-extract/score-stsg/TokenizedRuleHalf.cpp40
-rw-r--r--phrase-extract/score-stsg/TokenizedRuleHalf.h49
-rw-r--r--phrase-extract/score-stsg/Vocabulary.h15
-rw-r--r--phrase-extract/score.h28
-rw-r--r--phrase-extract/statistics-main.cpp29
-rw-r--r--phrase-extract/syntax-common/Jamfile8
-rw-r--r--phrase-extract/syntax-common/exception.h20
-rw-r--r--phrase-extract/syntax-common/numbered_set.h110
-rw-r--r--phrase-extract/syntax-common/string_tree.h13
-rw-r--r--phrase-extract/syntax-common/tree-inl.h115
-rw-r--r--phrase-extract/syntax-common/tree.h91
-rw-r--r--phrase-extract/syntax-common/tree_fragment_tokenizer.cc90
-rw-r--r--phrase-extract/syntax-common/tree_fragment_tokenizer.h69
-rw-r--r--phrase-extract/syntax-common/tree_fragment_tokenizer_test.cc74
-rw-r--r--phrase-extract/syntax-common/tree_test.cc66
-rw-r--r--phrase-extract/syntax-common/xml_tree_parser.cc59
-rw-r--r--phrase-extract/syntax-common/xml_tree_parser.h34
-rw-r--r--phrase-extract/tables-core.h2
-rw-r--r--regression-testing/Jamfile2
-rwxr-xr-xscripts/OSM/OSM-Train.perl19
-rwxr-xr-xscripts/Transliteration/post-decoding-transliteration.pl9
-rwxr-xr-xscripts/Transliteration/train-transliteration-module.pl12
-rwxr-xr-xscripts/analysis/oov.pl61
-rw-r--r--scripts/ems/example/config.basic20
-rw-r--r--scripts/ems/example/config.factored20
-rw-r--r--scripts/ems/example/config.hierarchical20
-rw-r--r--scripts/ems/example/config.syntax20
-rw-r--r--scripts/ems/example/config.toy20
-rw-r--r--scripts/ems/experiment.meta147
-rwxr-xr-xscripts/ems/experiment.perl151
-rwxr-xr-xscripts/ems/support/analysis.perl118
-rwxr-xr-xscripts/ems/support/build-sparse-features.perl8
-rw-r--r--scripts/ems/support/defaultconfig.py2
-rwxr-xr-xscripts/ems/support/interpolate-lm.perl72
-rwxr-xr-xscripts/ems/support/mml-filter.py2
-rwxr-xr-xscripts/ems/support/substitute-filtered-tables.perl4
-rwxr-xr-xscripts/ems/support/thot-lm-wrapper.perl20
-rwxr-xr-xscripts/generic/extract-parallel.perl55
-rwxr-xr-xscripts/generic/fsa2fsal.pl49
-rwxr-xr-xscripts/generic/fsal2fsa.pl15
-rwxr-xr-xscripts/generic/generic-parallel.perl16
-rwxr-xr-xscripts/generic/giza-parallel.perl42
-rwxr-xr-xscripts/generic/moses_sim_pe.py373
-rwxr-xr-xscripts/generic/score-parallel.perl32
-rwxr-xr-xscripts/other/beautify.perl2
-rwxr-xr-xscripts/other/blame-stat.sh4
-rwxr-xr-xscripts/other/delete-scores.perl2
-rwxr-xr-xscripts/other/retain-lines.perl31
-rwxr-xr-xscripts/recaser/train-recaser.perl2
-rw-r--r--scripts/server/moses.py19
-rwxr-xr-xscripts/server/sim-pe.py57
-rw-r--r--scripts/share/nonbreaking_prefixes/README.txt3
-rw-r--r--scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en14
-rw-r--r--scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fi138
-rw-r--r--scripts/tokenizer/basic-protected-patterns5
-rwxr-xr-xscripts/tokenizer/deescape-special-chars-PTB.perl19
-rwxr-xr-xscripts/tokenizer/normalize-punctuation.perl8
-rwxr-xr-xscripts/tokenizer/tokenizer.perl11
-rwxr-xr-xscripts/tokenizer/tokenizer_PTB.perl399
-rw-r--r--scripts/training/bilingual-lm/README9
-rwxr-xr-xscripts/training/bilingual-lm/averageNullEmbedding_baseline.py44
-rwxr-xr-xscripts/training/bilingual-lm/extract.py122
-rwxr-xr-xscripts/training/bilingual-lm/extract_test.py73
-rwxr-xr-xscripts/training/bilingual-lm/extract_training.py167
-rwxr-xr-xscripts/training/bilingual-lm/reduce_ngrams.py28
-rwxr-xr-xscripts/training/bilingual-lm/tag.sh17
-rwxr-xr-xscripts/training/bilingual-lm/test_nplm.py57
-rwxr-xr-xscripts/training/bilingual-lm/train_nplm.py121
-rwxr-xr-xscripts/training/build-mmsapt.perl23
-rwxr-xr-xscripts/training/filter-model-given-input.pl3
-rwxr-xr-xscripts/training/mert-moses.pl76
-rwxr-xr-xscripts/training/train-model.perl97
-rwxr-xr-xscripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl54
-rwxr-xr-xscripts/training/wrappers/conll2mosesxml.py183
-rwxr-xr-xscripts/training/wrappers/make-factor-brown-cluster-mkcls.perl10
-rwxr-xr-xscripts/training/wrappers/parse-de-berkeley.perl41
-rwxr-xr-xscripts/training/wrappers/parse-de-bitpar.perl10
-rwxr-xr-xscripts/training/wrappers/tagger-german-chunk.perl144
-rw-r--r--search/edge_generator.hh2
-rw-r--r--search/types.hh2
-rw-r--r--search/vertex_generator.hh2
-rw-r--r--util/exception.hh8
-rw-r--r--util/file.cc6
-rw-r--r--util/file.hh26
-rw-r--r--util/read_compressed.cc12
-rw-r--r--util/read_compressed.hh4
-rw-r--r--util/scoped.cc4
-rw-r--r--util/scoped.hh123
-rw-r--r--util/usage.hh4
867 files changed, 62217 insertions, 20225 deletions
diff --git a/.gitignore b/.gitignore
index f870bed03..9c82eb9f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+tools
*.d
*.pyc
*.lo
@@ -79,3 +80,4 @@ nbproject/
mingw/MosesGUI/MosesGUI.e4p
mingw/MosesGUI/_eric4project/
+contrib/m4m/merge-sorted
diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt
index 692df8616..d3983fd18 100644
--- a/BUILD-INSTRUCTIONS.txt
+++ b/BUILD-INSTRUCTIONS.txt
@@ -1,3 +1,4 @@
Please see the Moses website on how to compile and run Moses
http://www.statmt.org/moses/?n=Development.GetStarted
+
diff --git a/Jamroot b/Jamroot
index 1f7ca48cd..4514ecbb3 100644
--- a/Jamroot
+++ b/Jamroot
@@ -70,6 +70,7 @@
#-a to build from scratch
#-j$NCPUS to compile in parallel
#--clean to clean
+#--debug-build to build with Og. Only available with gcc 4.8+
import option ;
import modules ;
@@ -77,7 +78,7 @@ import path ;
path-constant TOP : . ;
include $(TOP)/jam-files/sanity.jam ;
-boost 103600 ;
+boost 104400 ;
external-lib z ;
lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
@@ -97,6 +98,11 @@ if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_min
echo "Tip: install tcmalloc for faster threading. See BUILD-INSTRUCTIONS.txt for more information." ;
}
+if [ option.get "debug-build" : : "yes" ] {
+ requirements += <cxxflags>-Og ;
+ echo "Building with -Og to enable easier profiling and debugging. Only available on gcc 4.8+." ;
+}
+
if [ option.get "enable-mpi" : : "yes" ] {
import mpi ;
using mpi ;
@@ -114,10 +120,37 @@ requirements += [ option.get "with-mm" : : <define>PT_UG ] ;
requirements += [ option.get "with-mm" : : <define>MAX_NUM_FACTORS=4 ] ;
requirements += [ option.get "unlabelled-source" : : <define>UNLABELLED_SOURCE ] ;
-if [ option.get "with-cmph" ] {
+if [ option.get "with-oxlm" ] {
+ external-lib boost_serialization ;
+ external-lib gomp ;
+ requirements += <library>boost_serialization ;
+ requirements += <library>gomp ;
+}
+
+if [ option.get "with-cmph" : : "yes" ] {
requirements += <define>HAVE_CMPH ;
}
+if [ option.get "with-icu" : : "yes" ]
+{
+ external-lib icuuc ;
+ external-lib icuio ;
+ external-lib icui18n ;
+ requirements += <library>icuuc/<link>shared ;
+ requirements += <library>icuio/<link>shared ;
+ requirements += <library>icui18n/<link>shared ;
+ requirements += <cxxflags>-fPIC ;
+ requirements += <address-model>64 ;
+ requirements += <runtime-link>shared ;
+}
+
+if [ option.get "with-probing-pt" : : "yes" ]
+{
+ external-lib boost_serialization ;
+ requirements += <define>HAVE_PROBINGPT ;
+ requirements += <library>boost_serialization ;
+}
+
project : default-build
<threading>multi
<warnings>on
@@ -140,17 +173,21 @@ project : requirements
;
#Add directories here if you want their incidental targets too (i.e. tests).
-build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses-chart-cmd mira scripts regression-testing ;
+build-projects lm util phrase-extract phrase-extract/syntax-common search moses moses/LM mert moses-cmd mira scripts regression-testing ;
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
+ moses/TranslationModel/UG//ptable-describe-features
+ moses/TranslationModel/UG//count-ptable-features
+ moses/TranslationModel/UG//ptable-lookup
+ moses/TranslationModel/UG//spe-check-coverage
+ moses/TranslationModel/UG/mm//mtt-demo1
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
moses/TranslationModel/UG/mm//symal2mam
moses/TranslationModel/UG/mm//mam2symal
moses/TranslationModel/UG/mm//mam_verify
- moses/TranslationModel/UG/mm//custom-pt
moses/TranslationModel/UG/mm//mmlex-build
moses/TranslationModel/UG/mm//mmlex-lookup
moses/TranslationModel/UG/mm//mtt-count-words
@@ -163,9 +200,19 @@ else
alias mm ;
}
+if [ option.get "with-rephraser" : : "yes" ]
+{
+ alias rephraser :
+ contrib/rephraser//paraphrase
+ ;
+}
+else
+{
+ alias rephraser ;
+}
+
alias programs :
lm//programs
-moses-chart-cmd//moses_chart
moses-cmd//programs
OnDiskPt//CreateOnDiskPt
OnDiskPt//queryOnDiskPt
@@ -177,15 +224,19 @@ phrase-extract//lexical-reordering
phrase-extract//extract-ghkm
phrase-extract//pcfg-extract
phrase-extract//pcfg-score
+phrase-extract//extract-mixed-syntax
+phrase-extract//score-stsg
+phrase-extract//filter-rule-table
biconcor
mira//mira
contrib/server//mosesserver
mm
+rephraser
;
install-bin-libs programs ;
-install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
+install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
install-headers headers-moses : moses//headers-to-install : moses ;
alias install : prefix-bin prefix-lib headers-base headers-moses ;
@@ -199,3 +250,11 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
echo "To disable this message, delete $(TOP)/dist ." ;
echo ;
}
+
+#local temp = [ _shell "bash source ./s.sh" ] ;
+local temp = [ _shell "mkdir bin" ] ;
+local temp = [ _shell "rm bin/moses_chart" ] ;
+local temp = [ _shell "cd bin && ln -s moses moses_chart" ] ;
+
+
+
diff --git a/NOTICE b/NOTICE
deleted file mode 100644
index 23d8b2ad1..000000000
--- a/NOTICE
+++ /dev/null
@@ -1,5 +0,0 @@
-This code includes data from Daniel Naber's Language Tools (czech abbreviations).
-
-This code includes data from czech wiktionary (also czech abbreviations).
-
-
diff --git a/OnDiskPt/Jamfile b/OnDiskPt/Jamfile
index 473f14cfe..0c25d6275 100644
--- a/OnDiskPt/Jamfile
+++ b/OnDiskPt/Jamfile
@@ -1,5 +1,5 @@
fakelib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp OnDiskQuery.cpp ../moses//headers ;
-exe CreateOnDiskPt : Main.cpp ../moses//moses OnDiskPt ;
-exe queryOnDiskPt : queryOnDiskPt.cpp ../moses//moses OnDiskPt ;
+exe CreateOnDiskPt : Main.cpp ..//boost_filesystem ../moses//moses OnDiskPt ;
+exe queryOnDiskPt : queryOnDiskPt.cpp ..//boost_filesystem ../moses//moses OnDiskPt ;
diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp
index 063caddb3..b0d154b9d 100644
--- a/OnDiskPt/Main.cpp
+++ b/OnDiskPt/Main.cpp
@@ -66,10 +66,9 @@ int main (int argc, char * const argv[])
PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode();
size_t lineNum = 0;
- char line[100000];
+ string line;
- //while(getline(inStream, line))
- while(inStream.getline(line, 100000)) {
+ while(getline(inStream, line)) {
lineNum++;
if (lineNum%1000 == 0) cerr << "." << flush;
if (lineNum%10000 == 0) cerr << ":" << flush;
@@ -107,8 +106,13 @@ bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::Sourc
return ret;
}
-OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
+OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, const std::string &lineStr, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
+ char line[lineStr.size() + 1];
+ strcpy(line, lineStr.c_str());
+
+ stringstream sparseFeatures, property;
+
size_t scoreInd = 0;
// MAIN LOOP
@@ -118,6 +122,7 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
2 = scores
3 = align
4 = count
+ 7 = properties
*/
char *tok = strtok (line," ");
OnDiskPt::PhrasePtr out(new Phrase());
@@ -148,29 +153,20 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
targetPhrase.CreateAlignFromString(tok);
break;
}
- case 4:
- ++stage;
- break;
- /* case 5: {
- // count info. Only store the 2nd one
- float val = Moses::Scan<float>(tok);
- misc[0] = val;
- ++stage;
- break;
- }*/
+ case 4: {
+ // store only the 3rd one (rule count)
+ float val = Moses::Scan<float>(tok);
+ misc[0] = val;
+ break;
+ }
case 5: {
- // count info. Only store the 2nd one
- //float val = Moses::Scan<float>(tok);
- //misc[0] = val;
- ++stage;
+ // sparse features
+ sparseFeatures << tok << " ";
break;
}
case 6: {
- // store only the 3rd one (rule count)
- float val = Moses::Scan<float>(tok);
- misc[0] = val;
- ++stage;
- break;
+ property << tok << " ";
+ break;
}
default:
cerr << "ERROR in line " << line << endl;
@@ -183,6 +179,8 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} // while (tok != NULL)
assert(scoreInd == numScores);
+ targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str()));
+ targetPhrase.SetProperty(Moses::Trim(property.str()));
targetPhrase.SortAlign();
return out;
} // Tokenize()
diff --git a/OnDiskPt/Main.h b/OnDiskPt/Main.h
index 2b2d585d8..fcdb2cd9d 100644
--- a/OnDiskPt/Main.h
+++ b/OnDiskPt/Main.h
@@ -29,7 +29,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget);
OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
- , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
+ , const std::string &lineStr, OnDiskPt::OnDiskWrapper &onDiskWrapper
, int numScores
, std::vector<float> &misc);
diff --git a/OnDiskPt/OnDiskWrapper.cpp b/OnDiskPt/OnDiskWrapper.cpp
index 12adffd03..91935e965 100644
--- a/OnDiskPt/OnDiskWrapper.cpp
+++ b/OnDiskPt/OnDiskWrapper.cpp
@@ -31,7 +31,7 @@ using namespace std;
namespace OnDiskPt
{
-int OnDiskWrapper::VERSION_NUM = 5;
+int OnDiskWrapper::VERSION_NUM = 7;
OnDiskWrapper::OnDiskWrapper()
{
diff --git a/OnDiskPt/TargetPhrase.cpp b/OnDiskPt/TargetPhrase.cpp
index cb821a557..cb6135d45 100644
--- a/OnDiskPt/TargetPhrase.cpp
+++ b/OnDiskPt/TargetPhrase.cpp
@@ -162,10 +162,14 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
// allocate mem
size_t numScores = onDiskWrapper.GetNumScores()
,numAlign = GetAlign().size();
+ size_t sparseFeatureSize = m_sparseFeatures.size();
+ size_t propSize = m_property.size();
- size_t memNeeded = sizeof(UINT64); // file pos (phrase id)
- memNeeded += sizeof(UINT64) + 2 * sizeof(UINT64) * numAlign; // align
- memNeeded += sizeof(float) * numScores; // scores
+ size_t memNeeded = sizeof(UINT64) // file pos (phrase id)
+ + sizeof(UINT64) + 2 * sizeof(UINT64) * numAlign // align
+ + sizeof(float) * numScores // scores
+ + sizeof(UINT64) + sparseFeatureSize // sparse features string
+ + sizeof(UINT64) + propSize; // property string
char *mem = (char*) malloc(memNeeded);
//memset(mem, 0, memNeeded);
@@ -183,11 +187,33 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
// scores
memUsed += WriteScoresToMemory(mem + memUsed);
+ // sparse features
+ memUsed += WriteStringToMemory(mem + memUsed, m_sparseFeatures);
+
+ // property string
+ memUsed += WriteStringToMemory(mem + memUsed, m_property);
+
//DebugMem(mem, memNeeded);
assert(memNeeded == memUsed);
return mem;
}
+size_t TargetPhrase::WriteStringToMemory(char *mem, const std::string &str) const
+{
+ size_t memUsed = 0;
+ UINT64 *memTmp = (UINT64*) mem;
+
+ size_t strSize = str.size();
+ memTmp[0] = strSize;
+ memUsed += sizeof(UINT64);
+
+ const char *charStr = str.c_str();
+ memcpy(mem + memUsed, charStr, strSize);
+ memUsed += strSize;
+
+ return memUsed;
+}
+
size_t TargetPhrase::WriteAlignToMemory(char *mem) const
{
size_t memUsed = 0;
@@ -231,7 +257,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
, const std::vector<float> &weightT
, bool isSyntax) const
{
- Moses::TargetPhrase *ret = new Moses::TargetPhrase();
+ Moses::TargetPhrase *ret = new Moses::TargetPhrase(&phraseDict);
// words
size_t phraseSize = GetSize();
@@ -279,7 +305,14 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
// scores
ret->GetScoreBreakdown().Assign(&phraseDict, m_scores);
- ret->Evaluate(mosesSP, phraseDict.GetFeaturesToApply());
+
+ // sparse features
+ ret->GetScoreBreakdown().Assign(&phraseDict, m_sparseFeatures);
+
+ // property
+ ret->SetProperties(m_property);
+
+ ret->EvaluateInIsolation(mosesSP, phraseDict.GetFeaturesToApply());
return ret;
}
@@ -299,9 +332,36 @@ UINT64 TargetPhrase::ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPC
memUsed += ReadScoresFromFile(fileTPColl);
assert((memUsed + filePos) == (UINT64)fileTPColl.tellg());
+ // sparse features
+ memUsed += ReadStringFromFile(fileTPColl, m_sparseFeatures);
+
+ // properties
+ memUsed += ReadStringFromFile(fileTPColl, m_property);
+
return memUsed;
}
+UINT64 TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr)
+{
+ UINT64 bytesRead = 0;
+
+ UINT64 strSize;
+ fileTPColl.read((char*) &strSize, sizeof(UINT64));
+ bytesRead += sizeof(UINT64);
+
+ if (strSize) {
+ char *mem = (char*) malloc(strSize + 1);
+ mem[strSize] = '\0';
+ fileTPColl.read(mem, strSize);
+ outStr = string(mem);
+ free(mem);
+
+ bytesRead += strSize;
+ }
+
+ return bytesRead;
+}
+
UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
{
UINT64 bytesRead = 0;
diff --git a/OnDiskPt/TargetPhrase.h b/OnDiskPt/TargetPhrase.h
index 5b8a30296..89b7f967e 100644
--- a/OnDiskPt/TargetPhrase.h
+++ b/OnDiskPt/TargetPhrase.h
@@ -50,15 +50,18 @@ class TargetPhrase: public Phrase
protected:
AlignType m_align;
PhrasePtr m_sourcePhrase;
+ std::string m_sparseFeatures, m_property;
std::vector<float> m_scores;
UINT64 m_filePos;
size_t WriteAlignToMemory(char *mem) const;
size_t WriteScoresToMemory(char *mem) const;
+ size_t WriteStringToMemory(char *mem, const std::string &str) const;
UINT64 ReadAlignFromFile(std::fstream &fileTPColl);
UINT64 ReadScoresFromFile(std::fstream &fileTPColl);
+ UINT64 ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr);
public:
TargetPhrase() {
@@ -110,6 +113,15 @@ public:
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
+ void SetProperty(const std::string &value)
+ {
+ m_property = value;
+ }
+
+ void SetSparseFeatures(const std::string &value)
+ {
+ m_sparseFeatures = value;
+ }
};
}
diff --git a/OnDiskPt/Word.cpp b/OnDiskPt/Word.cpp
index 23d29cc7a..33bdb6cc5 100644
--- a/OnDiskPt/Word.cpp
+++ b/OnDiskPt/Word.cpp
@@ -104,14 +104,20 @@ void Word::ConvertToMoses(
Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
overwrite = Moses::Word(m_isNonTerminal);
- // TODO: this conversion should have been done at load time.
- util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
-
- for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
- UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
- overwrite.SetFactor(*t, factorColl.AddFactor(*tok, m_isNonTerminal));
+ if (m_isNonTerminal) {
+ const std::string &tok = vocab.GetString(m_vocabId);
+ overwrite.SetFactor(0, factorColl.AddFactor(tok, m_isNonTerminal));
+ }
+ else {
+ // TODO: this conversion should have been done at load time.
+ util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
+
+ for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
+ UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
+ overwrite.SetFactor(*t, factorColl.AddFactor(*tok, m_isNonTerminal));
+ }
+ UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
}
- UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
}
int Word::Compare(const Word &compare) const
diff --git a/OnDiskPt/queryOnDiskPt.cpp b/OnDiskPt/queryOnDiskPt.cpp
index a38fc5435..77576d956 100644
--- a/OnDiskPt/queryOnDiskPt.cpp
+++ b/OnDiskPt/queryOnDiskPt.cpp
@@ -22,7 +22,7 @@ int main(int argc, char **argv)
{
int tableLimit = 20;
std::string ttable = "";
- bool useAlignments = false;
+ // bool useAlignments = false;
for(int i = 1; i < argc; i++) {
if(!strcmp(argv[i], "-tlimit")) {
diff --git a/biconcor/Jamfile b/biconcor/Jamfile
index 76f5c7aaf..83a738000 100644
--- a/biconcor/Jamfile
+++ b/biconcor/Jamfile
@@ -1,2 +1,2 @@
exe biconcor : Vocabulary.cpp SuffixArray.cpp TargetCorpus.cpp Alignment.cpp Mismatch.cpp PhrasePair.cpp PhrasePairCollection.cpp biconcor.cpp base64.cpp ;
-
+exe phrase-lookup : Vocabulary.cpp SuffixArray.cpp phrase-lookup.cpp ;
diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp
new file mode 100644
index 000000000..c6d1b9cdf
--- /dev/null
+++ b/biconcor/phrase-lookup.cpp
@@ -0,0 +1,132 @@
+#include "SuffixArray.h"
+#include <getopt.h>
+
+using namespace std;
+
+size_t lookup( string );
+vector<string> tokenize( const char input[] );
+SuffixArray suffixArray;
+
+int main(int argc, char* argv[]) {
+ // handle parameters
+ string query;
+ string fileNameSuffix;
+ string fileNameSource;
+ int loadFlag = false;
+ int saveFlag = false;
+ int createFlag = false;
+ int queryFlag = false;
+ int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
+ string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
+ while(1) {
+ static struct option long_options[] = {
+ {"load", required_argument, 0, 'l'},
+ {"save", required_argument, 0, 's'},
+ {"create", required_argument, 0, 'c'},
+ {"query", required_argument, 0, 'q'},
+ {"stdio", no_argument, 0, 'i'},
+ {0, 0, 0, 0}
+ };
+ int option_index = 0;
+ int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
+ if (c == -1) break;
+ switch (c) {
+ case 'l':
+ fileNameSuffix = string(optarg);
+ loadFlag = true;
+ break;
+ case 's':
+ fileNameSuffix = string(optarg);
+ saveFlag = true;
+ break;
+ case 'c':
+ fileNameSource = string(optarg);
+ createFlag = true;
+ break;
+ case 'q':
+ query = string(optarg);
+ queryFlag = true;
+ break;
+ case 'i':
+ stdioFlag = true;
+ break;
+ default:
+ cerr << info;
+ exit(1);
+ }
+ }
+ if (stdioFlag) {
+ queryFlag = true;
+ }
+
+ // check if parameter settings are legal
+ if (saveFlag && !createFlag) {
+ cerr << "error: cannot save without creating\n" << info;
+ exit(1);
+ }
+ if (saveFlag && loadFlag) {
+ cerr << "error: cannot load and save at the same time\n" << info;
+ exit(1);
+ }
+ if (!loadFlag && !createFlag) {
+ cerr << "error: neither load or create - i have no info!\n" << info;
+ exit(1);
+ }
+
+ // do your thing
+ if (createFlag) {
+ cerr << "will create\n";
+ cerr << "corpus is in " << fileNameSource << endl;
+ suffixArray.Create( fileNameSource );
+ if (saveFlag) {
+ suffixArray.Save( fileNameSuffix );
+ cerr << "will save in " << fileNameSuffix << endl;
+ }
+ }
+ if (loadFlag) {
+ cerr << "will load from " << fileNameSuffix << endl;
+ suffixArray.Load( fileNameSuffix );
+ }
+ if (stdioFlag) {
+ while(true) {
+ string query;
+ if (getline(cin, query, '\n').eof()) {
+ return 0;
+ }
+ cout << lookup( query ) << endl;
+ }
+ }
+ else if (queryFlag) {
+ cout << lookup( query ) << endl;
+ }
+ return 0;
+}
+
+size_t lookup( string query ) {
+ cerr << "query is " << query << endl;
+ vector< string > queryString = tokenize( query.c_str() );
+ return suffixArray.Count( queryString );
+}
+
+vector<string> tokenize( const char input[] )
+{
+ vector< string > token;
+ bool betweenWords = true;
+ int start=0;
+ int i=0;
+ for(; input[i] != '\0'; i++) {
+ bool isSpace = (input[i] == ' ' || input[i] == '\t');
+
+ if (!isSpace && betweenWords) {
+ start = i;
+ betweenWords = false;
+ } else if (isSpace && !betweenWords) {
+ token.push_back( string( input+start, i-start ) );
+ betweenWords = true;
+ }
+ }
+ if (!betweenWords)
+ token.push_back( string( input+start, i-start ) );
+ return token;
+}
+
diff --git a/contrib/makemteval/makemteval.ini b/contrib/makemteval/makemteval.ini
new file mode 100644
index 000000000..5493464d7
--- /dev/null
+++ b/contrib/makemteval/makemteval.ini
@@ -0,0 +1,12 @@
+[set]
+filein=
+fileout=
+settype=
+srclang=
+tstlang=
+setid=SetID
+refid=RefID
+sysid=SysID
+docid=DocID
+genre=Genre
+
diff --git a/contrib/makemteval/makemteval.py b/contrib/makemteval/makemteval.py
new file mode 100644
index 000000000..11b428ac6
--- /dev/null
+++ b/contrib/makemteval/makemteval.py
@@ -0,0 +1,253 @@
+#! /usr/bin/env python
+# -*- coding: utf8 -*-
+
+#===============================================================================
+# Author: Walapa Muangjeen
+#===============================================================================
+
+
+__version__ = '2.0'
+
+import sys
+import os
+import codecs
+import ConfigParser
+from optparse import OptionParser
+from copy import deepcopy
+
+
+class makemteval:
+
+ def __init__(self, config=None):
+
+ if isinstance(config,dict):
+ self.config = deepcopy(config)
+ else:
+ self.config = {
+ 'filein': None,
+ 'fileout': None,
+ 'settype': None,
+ 'srclang': None,
+ 'tstlang': None,
+ 'setid': 'SetID',
+ 'refid': 'RefID',
+ 'sysid': 'SysID',
+ 'docid': 'DocID',
+ 'genre': 'Genre',
+ }
+
+
+ def parseini(self, config=None, inifile=None, section='set'):
+
+ if inifile is None:
+ inifile = os.path.abspath(os.path.dirname(sys.argv[0])) + os.sep + os.path.splitext(os.path.basename(sys.argv[0]))[0] + '.ini'
+
+ if config is None:
+ config = self.config
+
+ cfgparser = ConfigParser.RawConfigParser()
+
+ if not cfgparser.has_section(section):
+ cfgparser.add_section(section)
+
+ for option in config:
+ cfgparser.set(section, option, config[option])
+
+ cfgparser.read(inifile)
+
+ for option in cfgparser.options(section):
+ config[option] = cfgparser.get(section, option)
+
+ return deepcopy(config)
+
+
+ def writesgm( self, config ):
+
+ try:
+ filein = codecs.open(os.path.abspath(os.path.expanduser(config['filein'])), "r", 'utf-8-sig')
+ except IOError, ErrorMessage:
+ sys.stderr.write("\n: %s\n"%(ErrorMessage))
+ sys.stderr.write(": End Program\n")
+ return True
+
+ if __name__ == "__main__":
+ sys.stderr.write( ": opened \"%s\" for reading\n"%(os.path.basename( config['filein'] )))
+
+ lines = [l.replace('&quot;','\"').replace('&apos;','\'').replace('&gt;','>').replace('&lt;','<').replace('&amp;','&') for l in filein.read().splitlines()]
+ filein.close()
+ lines = [l.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;').replace('\'','&apos;').replace('\"','&quot;') for l in lines]
+
+ if __name__ == "__main__":
+ sys.stderr.write(": closed \"%s\"\n"%(os.path.basename( config['filein'] )))
+
+ try:
+ fileout = codecs.open(os.path.abspath(os.path.expanduser(config['fileout'])), "w", 'utf8')
+ except IOError, ErrorMessage:
+ sys.stderr.write("\n: %s\n"%(ErrorMessage))
+ sys.stderr.write(": End Program\n")
+ return True
+
+ if __name__ == "__main__":
+ sys.stderr.write(": opened \"%s\" for writing\n"%(os.path.basename( config['fileout'] )))
+
+ contents = []
+ contents.append('<?xml version=\"1.0\" encoding=\"UTF-8\"?>')
+ contents.append('<!DOCTYPE mteval SYSTEM \"ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-xml-v1.3.dtd\">')
+ contents.append('<mteval>')
+
+ if config['settype'] == "srcset":
+ contents.append("<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\">"%(config))
+
+ elif config['settype'] == "refset":
+ contents.append('<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\" trglang=\"%(tstlang)s\" refid=\"%(refid)s\">'%(config))
+
+ elif config['settype'] == "tstset":
+ contents.append('<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\" trglang=\"%(tstlang)s\" sysid=\"%(sysid)s\" sysbleu=\"%(sysbleu)s\" language=\"%(language)s\">'%(config))
+
+ else:
+ fileout.close()
+ os.unlink(os.path.abspath(os.path.expanduser(config['fileout'])))
+ sys.stderr.write("\n: Invalid \"settype\" value %s\n"%(config['settype']))
+ sys.stderr.write(": End Program\n")
+ return True
+
+ contents.append('<DOC %sdocid=\"%s\" genre=\"%s\">'%('' if config['settype'] == "srcset" else 'sysid=\"%s\" '%(config['sysid']),config['docid'],config['genre']))
+
+ if __name__ == "__main__":
+ sys.stderr.write(": added header\n")
+
+ for i in range(len(lines)):
+ contents.append('<seg id=\"%d\"> %s </seg>'%(i+1,lines[i]))
+
+ if __name__ == "__main__":
+ sys.stderr.write(": added %d lines\n"%(i+1))
+
+ contents.append('</DOC>')
+ contents.append('</%s>'%(config['settype']))
+ contents.append('</mteval>')
+
+ if __name__ == "__main__":
+ sys.stderr.write(": added footer\n")
+
+ fileout.write('%s\n'%('\n'.join(contents)))
+ ferror = fileout.close()
+
+ if __name__ == "__main__":
+ sys.stderr.write(": closed \"" + os.path.basename( config['fileout'] ) + "\"\n")
+
+ return ferror
+
+
+def parsecmd( config = {} ):
+
+ optparser = OptionParser()
+
+ optparser.add_option(
+ "-i", "--filein", dest = "filein", default = config["filein"],
+ help = "UNC path to tokenized input file (required)")
+
+ optparser.add_option(
+ "-o", "--fileout", dest = "fileout", default = config["fileout"],
+ help = "UNC path of fileout file (required)")
+
+ optparser.add_option(
+ "-s", "--srclang", dest = "srclang", default = config["srclang"],
+ help = "2-letter code for source language (required)")
+
+ optparser.add_option(
+ "-t", "--tstlang", dest = "tstlang", default = config["tstlang"],
+ help = "2-letter code for test language (required)")
+
+ optparser.add_option(
+ "-T", "--settype", dest = "settype", default = config["settype"],
+ help = "Use XML tag: srcset, tstset or refset (required)")
+
+ optparser.add_option(
+ "-e", "--setid", dest = "setid", default = config["setid"],
+ help = "Test set ID (default \""+config["setid"]+"\")")
+
+ optparser.add_option(
+ "-d", "--docid", dest = "docid", default = config["docid"],
+ help = "Document ID (default \""+config["docid"]+"\")")
+
+ optparser.add_option(
+ "-r", "--refid", dest = "refid", default = config["refid"],
+ help = "Reference ID (default \""+config["refid"]+"\")")
+
+ optparser.add_option(
+ "-S", "--sysid", dest = "sysid", default = config["sysid"],
+ help = "System ID used to make the test set (default \""+config["sysid"]+"\")")
+
+ optparser.add_option(
+ "-g", "--genre", dest = "genre", default = config["genre"],
+ help = "Genre of the test set and system ID (default \""+config["genre"]+"\")")
+
+ options, commands = optparser.parse_args()
+
+ missing = []
+ for k,v in { "Error: missing --filein" : options.filein,
+ "Error: missing --fileout": options.fileout,
+ "Error: missing --settype": options.settype,
+ "Error: missing --srclang": options.srclang,
+ "Error: missing --tstlang": options.tstlang
+ }.items():
+ if not v:
+ missing.append(k)
+
+ if missing:
+ for msg in missing:
+ sys.stderr.write('%s\n'%(msg))
+ optparser.print_help()
+ exit(1)
+
+ config['filein'] = options.filein
+ config['fileout'] = options.fileout
+ config['settype'] = options.settype
+ config['setid'] = options.setid
+ config['srclang'] = options.srclang
+ config['tstlang'] = options.tstlang
+ config['refid'] = options.refid
+ config['sysid'] = options.sysid
+ config['docid'] = options.docid
+ config['genre'] = options.genre
+
+ sys.stderr.write(": Configuration complete\n")
+
+ return
+
+
+licensetxt=u'''CorpusFiltergraphâ„¢
+Copyright © 2010-2014 Precision Translation Tools Co., Ltd.
+
+This module is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program. If not, see http://www.gnu.org/licenses/.
+
+For more information, please contact Precision Translation Tools Pte
+at: http://www.precisiontranslationtools.com'''
+
+
+def main():
+
+ mksgm = makemteval()
+
+ mksgm.parseini(mksgm.config)
+
+ parsecmd(mksgm.config)
+
+ mksgm.writesgm(mksgm.config)
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/contrib/moses-speedtest/README.md b/contrib/moses-speedtest/README.md
new file mode 100644
index 000000000..c95c6a400
--- /dev/null
+++ b/contrib/moses-speedtest/README.md
@@ -0,0 +1,122 @@
+# Moses speedtesting framework
+
+### Description
+
+This is an automatic test framework that is designed to test the day to day performance changes in Moses.
+
+### Set up
+
+#### Set up a Moses repo
+Set up a Moses repo and build it with the desired configuration.
+```bash
+git clone https://github.com/moses-smt/mosesdecoder.git
+cd mosesdecoder
+./bjam -j10 --with-cmph=/usr/include/
+```
+You need to build Moses first, so that the testsuite knows what command you want it to use when rebuilding against newer revisions.
+
+#### Create a parent directory.
+Create a parent directory where the **runtests.py** and related scripts and configuration file should reside.
+This should also be the location of the TEST_DIR and TEST_LOG_DIR as explained in the next section.
+
+#### Set up a global configuration file.
+You need a configuration file for the testsuite. A sample configuration file is provided in **testsuite\_config**
+<pre>
+MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
+DROP_CACHES_COMM: sys_drop_caches 3
+TEST_DIR: /home/moses-speedtest/phrase_tables/tests
+TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
+BASEBRANCH: RELEASE-2.1.1
+</pre>
+
+The _MOSES\_REPO\_PATH_ is the place where you have set up and built moses.
+The _DROP\_CACHES\_COMM_ is the command that would be used to drop caches. It should run without needing root access.
+_TEST\_DIR_ is the directory where all the tests will reside.
+_TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time.
+_BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to be the latest Moses stable release.
+
+### Creating tests
+
+In order to create a test one should go into the TEST_DIR and create a new folder. That folder will be used for the name of the test.
+Inside that folder one should place a configuration file named **config**. The naming is mandatory.
+An example such configuration file is **test\_config**
+
+<pre>
+Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
+LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
+Variants: vanilla, cached, ldpre #Can't have cached without ldpre or vanilla
+</pre>
+
+The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo.) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths.
+The _LDPRE:_ specifies if tests should be run with any LD\_PRELOAD flags.
+The _Variants:_ line specifies what types of tests we should run. This particular line will run the following tests:
+1. A Vanilla test meaning just the command after _Command_ will be issued.
+2. A vanilla cached test meaning that after the vanilla test, the test will be run again without dropping caches in order to benchmark performance on cached filesystem.
+3. A test with LD_PRELOAD ldpreloads moses -f command. For each available LDPRELOAD comma separated library to preload.
+4. A cached version of all LD_PRELOAD tests.
+
+### Running tests.
+Running the tests is done through the **runtests.py** script.
+
+#### Running all tests.
+To run all tests, with the base branch and the latest revision (and generate new basebranch test data if such is missing) do a:
+```bash
+python3 runtests.py -c testsuite_config
+```
+
+#### Running specific tests.
+The script allows the user to manually run a particular test or to test against a specific branch or revision:
+<pre>
+moses-speedtest@crom:~/phrase_tables$ python3 runtests.py --help
+usage: runtests.py [-h] -c CONFIGFILE [-s SINGLETESTDIR] [-r REVISION]
+ [-b BRANCH]
+
+A python based speedtest suite for moses.
+
+optional arguments:
+ -h, --help show this help message and exit
+ -c CONFIGFILE, --configfile CONFIGFILE
+ Specify test config file
+ -s SINGLETESTDIR, --singletest SINGLETESTDIR
+ Single test name directory. Specify directory name,
+ not full path!
+ -r REVISION, --revision REVISION
+ Specify a specific revison for the test.
+ -b BRANCH, --branch BRANCH
+ Specify a branch for the test.
+</pre>
+
+### Generating HTML report.
+To generate a summary of the test results use the **html\_gen.py** script. It places a file named *index.html* in the current script directory.
+```bash
+python3 html_gen.py testsuite_config
+```
+You should use the generated file with the **style.css** file provided in the html directory.
+
+### Command line regression testing.
+Alternatively you could check for regressions from the command line using the **check\_for\_regression.py** script:
+```bash
+python3 check_for_regression.py TESTLOGS_DIRECTORY
+```
+
+Alternatively the results of all tests are logged inside the the specified TESTLOGS directory so you can manually check them for additional information such as date, time, revision, branch, etc...
+
+### Create a cron job:
+Create a cron job to run the tests daily and generate an html report. An example *cronjob* is available.
+```bash
+#!/bin/sh
+cd /home/moses-speedtest/phrase_tables
+
+python3 runtests.py -c testsuite_config #Run the tests.
+python3 html_gen.py testsuite_config #Generate html
+
+cp index.html /fs/thor4/html/www/speed-test/ #Update the html
+```
+
+Place the script in _/etc/cron.daily_ for daily testing
+
+###### Author
+Nikolay Bogoychev, 2014
+
+###### License
+This software is licensed under the LGPL. \ No newline at end of file
diff --git a/contrib/moses-speedtest/check_for_regression.py b/contrib/moses-speedtest/check_for_regression.py
new file mode 100644
index 000000000..1e269c0c6
--- /dev/null
+++ b/contrib/moses-speedtest/check_for_regression.py
@@ -0,0 +1,63 @@
+"""Checks if any of the latest tests have performed considerably differently than
+ the previous ones. Takes the log directory as an argument."""
+import os
+import sys
+from testsuite_common import Result, processLogLine, bcolors, getLastTwoLines
+
+LOGDIR = sys.argv[1] #Get the log directory as an argument
+PERCENTAGE = 5 #Default value for how much a test should change
+if len(sys.argv) == 3:
+    PERCENTAGE = float(sys.argv[2]) #Default is 5%, but a different threshold
+    #can be given as a command line parameter
+
+def printResults(regressed, better, unchanged, firsttime):
+ """Pretty print the results in different colours"""
+ if regressed != []:
+ for item in regressed:
+ print(bcolors.RED + "REGRESSION! " + item.testname + " Was: "\
+ + str(item.previous) + " Is: " + str(item.current) + " Change: "\
+ + str(abs(item.percentage)) + "%. Revision: " + item.revision\
+ + bcolors.ENDC)
+ print('\n')
+ if unchanged != []:
+ for item in unchanged:
+ print(bcolors.BLUE + "UNCHANGED: " + item.testname + " Revision: " +\
+ item.revision + bcolors.ENDC)
+ print('\n')
+ if better != []:
+ for item in better:
+ print(bcolors.GREEN + "IMPROVEMENT! " + item.testname + " Was: "\
+ + str(item.previous) + " Is: " + str(item.current) + " Change: "\
+ + str(abs(item.percentage)) + "%. Revision: " + item.revision\
+ + bcolors.ENDC)
+ if firsttime != []:
+ for item in firsttime:
+ print(bcolors.PURPLE + "First time test! " + item.testname +\
+ " Took: " + str(item.real) + " seconds. Revision: " +\
+ item.revision + bcolors.ENDC)
+
+
+all_files = os.listdir(LOGDIR)
+regressed = []
+better = []
+unchanged = []
+firsttime = []
+
+#Go through all log files and find which tests have performed better.
+for logfile in all_files:
+ (line1, line2) = getLastTwoLines(logfile, LOGDIR)
+ log1 = processLogLine(line1)
+ if line2 == '\n': # Empty line, only one test ever run
+ firsttime.append(log1)
+ continue
+ log2 = processLogLine(line2)
+ res = Result(log1.testname, log1.real, log2.real, log2.revision,\
+ log2.branch, log1.revision, log1.branch)
+ if res.percentage < -PERCENTAGE:
+ regressed.append(res)
+ elif res.change > PERCENTAGE:
+ better.append(res)
+ else:
+ unchanged.append(res)
+
+printResults(regressed, better, unchanged, firsttime)
diff --git a/contrib/moses-speedtest/cronjob b/contrib/moses-speedtest/cronjob
new file mode 100644
index 000000000..4f7183a48
--- /dev/null
+++ b/contrib/moses-speedtest/cronjob
@@ -0,0 +1,7 @@
+#!/bin/sh
+cd /home/moses-speedtest/phrase_tables
+
+python3 runtests.py -c testsuite_config #Run the tests.
+python3 html_gen.py testsuite_config #Generate html
+
+cp index.html /fs/thor4/html/www/speed-test/ #Update the html \ No newline at end of file
diff --git a/contrib/moses-speedtest/helpers/README.md b/contrib/moses-speedtest/helpers/README.md
new file mode 100644
index 000000000..87efbc78f
--- /dev/null
+++ b/contrib/moses-speedtest/helpers/README.md
@@ -0,0 +1,5 @@
+###Helpers
+
+This is a python script that basically gives you the equivalent of:
+```echo 3 > /proc/sys/vm/drop_caches```
+You need to set it up so it is executed with root access without needing a password so that the tests can be automated. \ No newline at end of file
diff --git a/contrib/moses-speedtest/helpers/sys_drop_caches.py b/contrib/moses-speedtest/helpers/sys_drop_caches.py
new file mode 100644
index 000000000..d4796e090
--- /dev/null
+++ b/contrib/moses-speedtest/helpers/sys_drop_caches.py
@@ -0,0 +1,22 @@
+#!/usr/bin/spython
+from sys import argv, stderr, exit
+from os import linesep as ls
+procfile = "/proc/sys/vm/drop_caches"
+options = ["1","2","3"]
+flush_type = None
+try:
+ flush_type = argv[1][0:1]
+ if not flush_type in options:
+ raise IndexError, "not in options"
+ with open(procfile, "w") as f:
+ f.write("%s%s" % (flush_type,ls))
+ exit(0)
+except IndexError, e:
+ stderr.write("Argument %s required.%s" % (options, ls))
+except IOError, e:
+ stderr.write("Error writing to file.%s" % ls)
+except StandardError, e:
+ stderr.write("Unknown Error.%s" % ls)
+
+exit(1)
+
diff --git a/contrib/moses-speedtest/html/README.md b/contrib/moses-speedtest/html/README.md
new file mode 100644
index 000000000..342a8cedf
--- /dev/null
+++ b/contrib/moses-speedtest/html/README.md
@@ -0,0 +1,5 @@
+###HTML files.
+
+_index.html_ is a sample generated file by this testsuite.
+
+_style.css_ should be placed in the html directory in which _index.html_ will be placed in order to visualize the test results in a browser.
diff --git a/contrib/moses-speedtest/html/index.html b/contrib/moses-speedtest/html/index.html
new file mode 100644
index 000000000..fc75b1028
--- /dev/null
+++ b/contrib/moses-speedtest/html/index.html
@@ -0,0 +1,32 @@
+<html>
+<head>
+<title>Moses speed testing</title>
+<link rel="stylesheet" type="text/css" href="style.css"></head><body><text><b>Basebranch:</b> RELEASE-2.1 <b>Revision:</b> c977ca2f434ed6f12a352806c088061c492b1676</text><table><tr class="heading">
+ <th>Date</th>
+ <th>Time</th>
+ <th>Testname</th>
+ <th>Revision</th>
+ <th>Branch</th>
+ <th>Time</th>
+ <th>Prevtime</th>
+ <th>Prevrev</th>
+ <th>Change (%)</th>
+ <th>Time (Basebranch)</th>
+ <th>Change (%, Basebranch)</th>
+ <th>Time (Days -2)</th>
+ <th>Change (%, Days -2)</th>
+ <th>Time (Days -3)</th>
+ <th>Change (%, Days -3)</th>
+ <th>Time (Days -4)</th>
+ <th>Change (%, Days -4)</th>
+ <th>Time (Days -5)</th>
+ <th>Change (%, Days -5)</th>
+ <th>Time (Days -6)</th>
+ <th>Change (%, Days -6)</th>
+ <th>Time (Days -7)</th>
+ <th>Change (%, Days -7)</th>
+ <th>Time (Days -14)</th>
+ <th>Change (%, Days -14)</th>
+ <th>Time (Years -1)</th>
+ <th>Change (%, Years -1)</th>
+ </tr><tr><td>10.06.2014</td><td>10:27:57</td><td>ondisk_minreord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>21.36</td><td>21.49</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.006</td><td>25.89</td><td class="better">0.1699</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:29:38</td><td>minpt_reord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>9.73</td><td>9.52</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0221</td><td>12.2</td><td class="better">0.2197</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:22:32</td><td>ondisk_hierarchical_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>25.73</td><td>25.77</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.0016</td><td>33.63</td><td class="better">0.2337</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:22:06</td><td>ondisk_hierarchical_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>83.2</td><td>82.6</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0073</td><td>127.59</td><td 
class="better">0.3526</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:28:57</td><td>binary_reord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>24.54</td><td>24.85</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.0125</td><td>29.09</td><td class="better">0.1458</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:28:08</td><td>ondisk_minreord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>10.71</td><td>10.54</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0161</td><td>14.82</td><td class="better">0.2888</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:30:00</td><td>binary_minreord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>20.82</td><td>20.77</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0024</td><td>25.77</td><td class="better">0.194</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:27:35</td><td>score.hiero_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>131.37</td><td>130.63</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0057</td><td>141.85</td><td 
class="better">0.0791</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:29:10</td><td>binary_reord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>13.41</td><td>13.4</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0007</td><td>18.12</td><td class="better">0.2605</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:29:28</td><td>minpt_reord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>17.46</td><td>17.37</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0052</td><td>20.0</td><td class="better">0.1315</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:28:22</td><td>minpt_minreord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>13.75</td><td>13.56</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.014</td><td>17.19</td><td class="better">0.2112</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:22:59</td><td>ondisk_reord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>25.28</td><td>25.0</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0112</td><td>29.11</td><td 
class="better">0.1412</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:28:31</td><td>minpt_minreord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>8.63</td><td>8.6</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0035</td><td>11.78</td><td class="better">0.2699</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:23:10</td><td>ondisk_reord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>11.57</td><td>11.59</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.0017</td><td>15.4</td><td class="better">0.2474</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:25:24</td><td>score.hiero_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>132.33</td><td>130.02</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0178</td><td>141.35</td><td class="better">0.0802</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:30:12</td><td>binary_minreord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>12.47</td><td>12.61</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.0111</td><td>17.89</td><td 
class="better">0.2951</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr></table></body></html>
diff --git a/contrib/moses-speedtest/html/style.css b/contrib/moses-speedtest/html/style.css
new file mode 100644
index 000000000..16221f91f
--- /dev/null
+++ b/contrib/moses-speedtest/html/style.css
@@ -0,0 +1,21 @@
+table,th,td
+{
+border:1px solid black;
+ border-collapse:collapse
+}
+
+tr:nth-child(odd) {
+ background-color: Gainsboro;
+}
+
+.better {
+ color: Green;
+}
+
+.worse {
+ color: Red;
+}
+
+.unchanged {
+ color: SkyBlue;
+} \ No newline at end of file
diff --git a/contrib/moses-speedtest/html_gen.py b/contrib/moses-speedtest/html_gen.py
new file mode 100644
index 000000000..80e88329c
--- /dev/null
+++ b/contrib/moses-speedtest/html_gen.py
@@ -0,0 +1,192 @@
+"""Generates HTML page containing the testresults"""
+from testsuite_common import Result, processLogLine, getLastTwoLines
+from runtests import parse_testconfig
+import os
+import sys
+
+from datetime import datetime, timedelta
+
+HTML_HEADING = """<html>
+<head>
+<title>Moses speed testing</title>
+<link rel="stylesheet" type="text/css" href="style.css"></head><body>"""
+HTML_ENDING = "</table></body></html>\n"
+
+TABLE_HEADING = """<table><tr class="heading">
+ <th>Date</th>
+ <th>Time</th>
+ <th>Testname</th>
+ <th>Revision</th>
+ <th>Branch</th>
+ <th>Time</th>
+ <th>Prevtime</th>
+ <th>Prevrev</th>
+ <th>Change (%)</th>
+ <th>Time (Basebranch)</th>
+ <th>Change (%, Basebranch)</th>
+ <th>Time (Days -2)</th>
+ <th>Change (%, Days -2)</th>
+ <th>Time (Days -3)</th>
+ <th>Change (%, Days -3)</th>
+ <th>Time (Days -4)</th>
+ <th>Change (%, Days -4)</th>
+ <th>Time (Days -5)</th>
+ <th>Change (%, Days -5)</th>
+ <th>Time (Days -6)</th>
+ <th>Change (%, Days -6)</th>
+ <th>Time (Days -7)</th>
+ <th>Change (%, Days -7)</th>
+ <th>Time (Days -14)</th>
+ <th>Change (%, Days -14)</th>
+ <th>Time (Years -1)</th>
+ <th>Change (%, Years -1)</th>
+ </tr>"""
+
+def get_prev_days(date, numdays):
+ """Gets the date numdays previous days so that we could search for
+ that test in the config file"""
+ date_obj = datetime.strptime(date, '%d.%m.%Y').date()
+ past_date = date_obj - timedelta(days=numdays)
+ return past_date.strftime('%d.%m.%Y')
+
+def gather_necessary_lines(logfile, date):
+ """Gathers the necessary lines corresponding to past dates
+ and parses them if they exist"""
+ #Get a dictionary of dates
+ dates = {}
+ dates[get_prev_days(date, 2)] = ('-2', None)
+ dates[get_prev_days(date, 3)] = ('-3', None)
+ dates[get_prev_days(date, 4)] = ('-4', None)
+ dates[get_prev_days(date, 5)] = ('-5', None)
+ dates[get_prev_days(date, 6)] = ('-6', None)
+ dates[get_prev_days(date, 7)] = ('-7', None)
+ dates[get_prev_days(date, 14)] = ('-14', None)
+ dates[get_prev_days(date, 365)] = ('-365', None)
+
+ openfile = open(logfile, 'r')
+ for line in openfile:
+ if line.split()[0] in dates.keys():
+ day = dates[line.split()[0]][0]
+ dates[line.split()[0]] = (day, processLogLine(line))
+ openfile.close()
+ return dates
+
+def append_date_to_table(resline):
+ """Appends past dates to the html"""
+ cur_html = '<td>' + str(resline.previous) + '</td>'
+
+ if resline.percentage > 0.05: #If we have improvement of more than 5%
+ cur_html = cur_html + '<td class="better">' + str(resline.percentage) + '</td>'
+ elif resline.percentage < -0.05: #We have a regression of more than 5%
+ cur_html = cur_html + '<td class="worse">' + str(resline.percentage) + '</td>'
+ else:
+ cur_html = cur_html + '<td class="unchanged">' + str(resline.percentage) + '</td>'
+ return cur_html
+
+def compare_rev(filename, rev1, rev2, branch1=False, branch2=False):
+ """Compare the test results of two lines. We can specify either a
+ revision or a branch for comparison. The first rev should be the
+ base version and the second revision should be the later version"""
+
+ #In the log file the index of the revision is 2 but the index of
+ #the branch is 12. Alternate those depending on whether we are looking
+ #for a specific revision or branch.
+ firstidx = 2
+ secondidx = 2
+ if branch1 == True:
+ firstidx = 12
+ if branch2 == True:
+ secondidx = 12
+
+ rev1line = ''
+ rev2line = ''
+ resfile = open(filename, 'r')
+ for line in resfile:
+ if rev1 == line.split()[firstidx]:
+ rev1line = line
+ elif rev2 == line.split()[secondidx]:
+ rev2line = line
+ if rev1line != '' and rev2line != '':
+ break
+ resfile.close()
+ if rev1line == '':
+ raise ValueError('Revision ' + rev1 + " was not found!")
+ if rev2line == '':
+ raise ValueError('Revision ' + rev2 + " was not found!")
+
+ logLine1 = processLogLine(rev1line)
+ logLine2 = processLogLine(rev2line)
+ res = Result(logLine1.testname, logLine1.real, logLine2.real,\
+ logLine2.revision, logLine2.branch, logLine1.revision, logLine1.branch)
+
+ return res
+
+def produce_html(path, global_config):
+ """Produces html file for the report."""
+ html = '' #The table HTML
+ for filenam in os.listdir(global_config.testlogs):
+ #Generate html for the newest two lines
+ #Get the lines from the config file
+ (ll1, ll2) = getLastTwoLines(filenam, global_config.testlogs)
+ logLine1 = processLogLine(ll1)
+    logLine2 = processLogLine(ll2) #This is the line from the latest revision
+
+ #Generate html
+ res1 = Result(logLine1.testname, logLine1.real, logLine2.real,\
+ logLine2.revision, logLine2.branch, logLine1.revision, logLine1.branch)
+ html = html + '<tr><td>' + logLine2.date + '</td><td>' + logLine2.time + '</td><td>' +\
+ res1.testname + '</td><td>' + res1.revision[:10] + '</td><td>' + res1.branch + '</td><td>' +\
+ str(res1.current) + '</td><td>' + str(res1.previous) + '</td><td>' + res1.prevrev[:10] + '</td>'
+
+ #Add fancy colours depending on the change
+ if res1.percentage > 0.05: #If we have improvement of more than 5%
+ html = html + '<td class="better">' + str(res1.percentage) + '</td>'
+ elif res1.percentage < -0.05: #We have a regression of more than 5%
+ html = html + '<td class="worse">' + str(res1.percentage) + '</td>'
+ else:
+ html = html + '<td class="unchanged">' + str(res1.percentage) + '</td>'
+
+ #Get comparison against the base version
+ filenam = global_config.testlogs + '/' + filenam #Get proper directory
+ res2 = compare_rev(filenam, global_config.basebranch, res1.revision, branch1=True)
+ html = html + '<td>' + str(res2.previous) + '</td>'
+
+ #Add fancy colours depending on the change
+ if res2.percentage > 0.05: #If we have improvement of more than 5%
+ html = html + '<td class="better">' + str(res2.percentage) + '</td>'
+ elif res2.percentage < -0.05: #We have a regression of more than 5%
+ html = html + '<td class="worse">' + str(res2.percentage) + '</td>'
+ else:
+ html = html + '<td class="unchanged">' + str(res2.percentage) + '</td>'
+
+ #Add extra dates comparison dating from the beginning of time if they exist
+ past_dates = list(range(2, 8))
+ past_dates.append(14)
+ past_dates.append(365) # Get the 1 year ago day
+ linesdict = gather_necessary_lines(filenam, logLine2.date)
+
+ for days in past_dates:
+ act_date = get_prev_days(logLine2.date, days)
+ if linesdict[act_date][1] is not None:
+ logline_date = linesdict[act_date][1]
+ restemp = Result(logline_date.testname, logline_date.real, logLine2.real,\
+ logLine2.revision, logLine2.branch, logline_date.revision, logline_date.branch)
+ html = html + append_date_to_table(restemp)
+ else:
+ html = html + '<td>N/A</td><td>N/A</td>'
+
+
+
+ html = html + '</tr>' #End row
+
+ #Write out the file
+ basebranch_info = '<text><b>Basebranch:</b> ' + res2.prevbranch + ' <b>Revision:</b> ' +\
+ res2.prevrev + '</text>'
+ writeoutstr = HTML_HEADING + basebranch_info + TABLE_HEADING + html + HTML_ENDING
+ writefile = open(path, 'w')
+ writefile.write(writeoutstr)
+ writefile.close()
+
+if __name__ == '__main__':
+ CONFIG = parse_testconfig(sys.argv[1])
+ produce_html('index.html', CONFIG)
diff --git a/contrib/moses-speedtest/runtests.py b/contrib/moses-speedtest/runtests.py
new file mode 100644
index 000000000..0978c8ef2
--- /dev/null
+++ b/contrib/moses-speedtest/runtests.py
@@ -0,0 +1,293 @@
+"""Given a config file, runs tests"""
+import os
+import subprocess
+import time
+from argparse import ArgumentParser
+from testsuite_common import processLogLine
+
+def parse_cmd():
+ """Parse the command line arguments"""
+ description = "A python based speedtest suite for moses."
+ parser = ArgumentParser(description=description)
+ parser.add_argument("-c", "--configfile", action="store",\
+ dest="configfile", required=True,\
+ help="Specify test config file")
+ parser.add_argument("-s", "--singletest", action="store",\
+ dest="singletestdir", default=None,\
+ help="Single test name directory. Specify directory name,\
+ not full path!")
+ parser.add_argument("-r", "--revision", action="store",\
+ dest="revision", default=None,\
+                        help="Specify a specific revision for the test.")
+ parser.add_argument("-b", "--branch", action="store",\
+ dest="branch", default=None,\
+ help="Specify a branch for the test.")
+
+ arguments = parser.parse_args()
+ return arguments
+
+def repoinit(testconfig):
+ """Determines revision and sets up the repo."""
+ revision = ''
+ #Update the repo
+ os.chdir(testconfig.repo)
+ #Checkout specific branch, else maintain main branch
+ if testconfig.branch != 'master':
+ subprocess.call(['git', 'checkout', testconfig.branch])
+ rev, _ = subprocess.Popen(['git', 'rev-parse', 'HEAD'],\
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+ revision = str(rev).replace("\\n'", '').replace("b'", '')
+ else:
+ subprocess.call(['git checkout master'], shell=True)
+
+ #Check a specific revision. Else checkout master.
+ if testconfig.revision:
+ subprocess.call(['git', 'checkout', testconfig.revision])
+ revision = testconfig.revision
+ elif testconfig.branch == 'master':
+ subprocess.call(['git pull'], shell=True)
+ rev, _ = subprocess.Popen(['git rev-parse HEAD'], stdout=subprocess.PIPE,\
+ stderr=subprocess.PIPE, shell=True).communicate()
+ revision = str(rev).replace("\\n'", '').replace("b'", '')
+
+ return revision
+
+class Configuration:
+    """A simple class to hold all of the configuration constants"""
+ def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev):
+ self.repo = repo
+ self.drop_caches = drop_caches
+ self.tests = tests
+ self.testlogs = testlogs
+ self.basebranch = basebranch
+ self.baserev = baserev
+ self.singletest = None
+ self.revision = None
+ self.branch = 'master' # Default branch
+
+ def additional_args(self, singletest, revision, branch):
+ """Additional configuration from command line arguments"""
+ self.singletest = singletest
+ if revision is not None:
+ self.revision = revision
+ if branch is not None:
+ self.branch = branch
+
+ def set_revision(self, revision):
+ """Sets the current revision that is being tested"""
+ self.revision = revision
+
+
+class Test:
+ """A simple class to contain all information about tests"""
+ def __init__(self, name, command, ldopts, permutations):
+ self.name = name
+ self.command = command
+ self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet
+ self.permutations = permutations
+
+def parse_configfile(conffile, testdir, moses_repo):
+ """Parses the config file"""
+ command, ldopts = '', ''
+ permutations = []
+ fileopen = open(conffile, 'r')
+ for line in fileopen:
+ line = line.split('#')[0] # Discard comments
+ if line == '' or line == '\n':
+ continue # Discard lines with comments only and empty lines
+ opt, args = line.split(' ', 1) # Get arguments
+
+ if opt == 'Command:':
+ command = args.replace('\n', '')
+ command = moses_repo + '/bin/' + command
+ elif opt == 'LDPRE:':
+ ldopts = args.replace('\n', '')
+ elif opt == 'Variants:':
+ permutations = args.replace('\n', '').replace(' ', '').split(',')
+ else:
+ raise ValueError('Unrecognized option ' + opt)
+ #We use the testdir as the name.
+ testcase = Test(testdir, command, ldopts, permutations)
+ fileopen.close()
+ return testcase
+
+def parse_testconfig(conffile):
+ """Parses the config file for the whole testsuite."""
+ repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', ''
+ basebranch, baserev = '', ''
+ fileopen = open(conffile, 'r')
+ for line in fileopen:
+ line = line.split('#')[0] # Discard comments
+ if line == '' or line == '\n':
+ continue # Discard lines with comments only and empty lines
+ opt, args = line.split(' ', 1) # Get arguments
+ if opt == 'MOSES_REPO_PATH:':
+ repo_path = args.replace('\n', '')
+ elif opt == 'DROP_CACHES_COMM:':
+ drop_caches = args.replace('\n', '')
+ elif opt == 'TEST_DIR:':
+ tests_dir = args.replace('\n', '')
+ elif opt == 'TEST_LOG_DIR:':
+ testlog_dir = args.replace('\n', '')
+ elif opt == 'BASEBRANCH:':
+ basebranch = args.replace('\n', '')
+ elif opt == 'BASEREV:':
+ baserev = args.replace('\n', '')
+ else:
+ raise ValueError('Unrecognized option ' + opt)
+ config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\
+ basebranch, baserev)
+ fileopen.close()
+ return config
+
+def get_config():
+ """Builds the config object with all necessary attributes"""
+ args = parse_cmd()
+ config = parse_testconfig(args.configfile)
+ config.additional_args(args.singletestdir, args.revision, args.branch)
+ revision = repoinit(config)
+ config.set_revision(revision)
+ return config
+
+def check_for_basever(testlogfile, basebranch):
+ """Checks if the base revision is present in the testlogs"""
+ filetoopen = open(testlogfile, 'r')
+ for line in filetoopen:
+ templine = processLogLine(line)
+ if templine.branch == basebranch:
+ return True
+ return False
+
+def split_time(filename):
+    """Splits the output of the time function into separate parts.
+ We will write time to file, because many programs output to
+ stderr which makes it difficult to get only the exact results we need."""
+ timefile = open(filename, 'r')
+ realtime = float(timefile.readline().replace('\n', '').split()[1])
+ usertime = float(timefile.readline().replace('\n', '').split()[1])
+ systime = float(timefile.readline().replace('\n', '').split()[1])
+ timefile.close()
+
+ return (realtime, usertime, systime)
+
+
+def write_log(time_file, logname, config):
+ """Writes to a logfile"""
+ log_write = open(config.testlogs + '/' + logname, 'a') # Open logfile
+ date_run = time.strftime("%d.%m.%Y %H:%M:%S") # Get the time of the test
+ realtime, usertime, systime = split_time(time_file) # Get the times in a nice form
+
+ # Append everything to a log file.
+ writestr = date_run + " " + config.revision + " Testname: " + logname +\
+ " RealTime: " + str(realtime) + " UserTime: " + str(usertime) +\
+ " SystemTime: " + str(systime) + " Branch: " + config.branch +'\n'
+ log_write.write(writestr)
+ log_write.close()
+
+
+def execute_tests(testcase, cur_directory, config):
+ """Executes timed tests based on the config file"""
+ #Figure out the order of which tests must be executed.
+ #Change to the current test directory
+ os.chdir(config.tests + '/' + cur_directory)
+ #Clear caches
+ subprocess.call(['sync'], shell=True)
+ subprocess.call([config.drop_caches], shell=True)
+ #Perform vanilla test and if a cached test exists - as well
+ print(testcase.name)
+ if 'vanilla' in testcase.permutations:
+ print(testcase.command)
+ subprocess.Popen(['time -p -o /tmp/time_moses_tests ' + testcase.command], stdout=None,\
+ stderr=subprocess.PIPE, shell=True).communicate()
+ write_log('/tmp/time_moses_tests', testcase.name + '_vanilla', config)
+ if 'cached' in testcase.permutations:
+ subprocess.Popen(['time -p -o /tmp/time_moses_tests ' + testcase.command], stdout=None,\
+ stderr=None, shell=True).communicate()
+ write_log('/tmp/time_moses_tests', testcase.name + '_vanilla_cached', config)
+
+ #Now perform LD_PRELOAD tests
+ if 'ldpre' in testcase.permutations:
+ for opt in testcase.ldopts:
+ #Clear caches
+ subprocess.call(['sync'], shell=True)
+ subprocess.call([config.drop_caches], shell=True)
+
+ #test
+ subprocess.Popen(['LD_PRELOAD ' + opt + ' time -p -o /tmp/time_moses_tests ' + testcase.command], stdout=None,\
+ stderr=None, shell=True).communicate()
+ write_log('/tmp/time_moses_tests', testcase.name + '_ldpre_' + opt, config)
+ if 'cached' in testcase.permutations:
+ subprocess.Popen(['LD_PRELOAD ' + opt + ' time -p -o /tmp/time_moses_tests ' + testcase.command], stdout=None,\
+ stderr=None, shell=True).communicate()
+ write_log('/tmp/time_moses_tests', testcase.name + '_ldpre_' +opt +'_cached', config)
+
+# Go through all the test directories and executes tests
+if __name__ == '__main__':
+ CONFIG = get_config()
+ ALL_DIR = os.listdir(CONFIG.tests)
+
+ #We should first check if any of the tests is run for the first time.
+ #If some of them are run for the first time we should first get their
+ #time with the base version (usually the previous release)
+ FIRSTTIME = []
+ TESTLOGS = []
+ #Strip filenames of test underscores
+ for listline in os.listdir(CONFIG.testlogs):
+ listline = listline.replace('_vanilla', '')
+ listline = listline.replace('_cached', '')
+ listline = listline.replace('_ldpre', '')
+ TESTLOGS.append(listline)
+ for directory in ALL_DIR:
+ if directory not in TESTLOGS:
+ FIRSTTIME.append(directory)
+
+ #Sometimes even though we have the log files, we will need to rerun them
+ #Against a base version, because we require a different baseversion (for
+ #example when a new version of Moses is released.) Therefore we should
+ #Check if the version of Moses that we have as a base version is in all
+ #of the log files.
+
+ for logfile in os.listdir(CONFIG.testlogs):
+ logfile_name = CONFIG.testlogs + '/' + logfile
+ if not check_for_basever(logfile_name, CONFIG.basebranch):
+ logfile = logfile.replace('_vanilla', '')
+ logfile = logfile.replace('_cached', '')
+ logfile = logfile.replace('_ldpre', '')
+ FIRSTTIME.append(logfile)
+ FIRSTTIME = list(set(FIRSTTIME)) #Deduplicate
+
+ if FIRSTTIME != []:
+ #Create a new configuration for base version tests:
+ BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\
+ CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\
+ CONFIG.baserev)
+ BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch)
+ #Set up the repository and get its revision:
+ REVISION = repoinit(BASECONFIG)
+ BASECONFIG.set_revision(REVISION)
+ #Build
+ os.chdir(BASECONFIG.repo)
+ subprocess.call(['./previous.sh'], shell=True)
+
+ #Perform tests
+ for directory in FIRSTTIME:
+ cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\
+ '/config', directory, BASECONFIG.repo)
+ execute_tests(cur_testcase, directory, BASECONFIG)
+
+ #Reset back the repository to the normal configuration
+ repoinit(CONFIG)
+
+ #Builds moses
+ os.chdir(CONFIG.repo)
+ subprocess.call(['./previous.sh'], shell=True)
+
+ if CONFIG.singletest:
+ TESTCASE = parse_configfile(CONFIG.tests + '/' +\
+ CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo)
+ execute_tests(TESTCASE, CONFIG.singletest, CONFIG)
+ else:
+ for directory in ALL_DIR:
+ cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\
+ '/config', directory, CONFIG.repo)
+ execute_tests(cur_testcase, directory, CONFIG)
diff --git a/contrib/moses-speedtest/sys_drop_caches.py b/contrib/moses-speedtest/sys_drop_caches.py
new file mode 100644
index 000000000..d4796e090
--- /dev/null
+++ b/contrib/moses-speedtest/sys_drop_caches.py
@@ -0,0 +1,22 @@
+#!/usr/bin/spython
+from sys import argv, stderr, exit
+from os import linesep as ls
+procfile = "/proc/sys/vm/drop_caches"
+options = ["1","2","3"]
+flush_type = None
+try:
+ flush_type = argv[1][0:1]
+ if not flush_type in options:
+ raise IndexError, "not in options"
+ with open(procfile, "w") as f:
+ f.write("%s%s" % (flush_type,ls))
+ exit(0)
+except IndexError, e:
+ stderr.write("Argument %s required.%s" % (options, ls))
+except IOError, e:
+ stderr.write("Error writing to file.%s" % ls)
+except StandardError, e:
+ stderr.write("Unknown Error.%s" % ls)
+
+exit(1)
+
diff --git a/contrib/moses-speedtest/test_config b/contrib/moses-speedtest/test_config
new file mode 100644
index 000000000..4a480f496
--- /dev/null
+++ b/contrib/moses-speedtest/test_config
@@ -0,0 +1,3 @@
+Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
+LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
+Variants: vanilla, cached, ldpre #Can't have cached without ldpre or vanilla
diff --git a/contrib/moses-speedtest/testsuite_common.py b/contrib/moses-speedtest/testsuite_common.py
new file mode 100644
index 000000000..be96f98b5
--- /dev/null
+++ b/contrib/moses-speedtest/testsuite_common.py
@@ -0,0 +1,54 @@
+"""Common functions of the testsuite"""
+import os
+#Colour constants
+class bcolors:
+ PURPLE = '\033[95m'
+ BLUE = '\033[94m'
+ GREEN = '\033[92m'
+ YELLOW = '\033[93m'
+ RED = '\033[91m'
+ ENDC = '\033[0m'
+
+class LogLine:
+ """A class to contain logfile line"""
+ def __init__(self, date, time, revision, testname, real, user, system, branch):
+ self.date = date
+ self.time = time
+ self.revision = revision
+ self.testname = testname
+ self.real = real
+ self.system = system
+ self.user = user
+ self.branch = branch
+
+class Result:
+ """A class to contain results of benchmarking"""
+ def __init__(self, testname, previous, current, revision, branch, prevrev, prevbranch):
+ self.testname = testname
+ self.previous = previous
+ self.current = current
+ self.change = previous - current
+ self.revision = revision
+ self.branch = branch
+ self.prevbranch = prevbranch
+ self.prevrev = prevrev
+ #Produce a percentage with fewer digits
+ self.percentage = float(format(1 - current/previous, '.4f'))
+
+def processLogLine(logline):
+ """Parses the log line into a nice datastructure"""
+ logline = logline.split()
+ log = LogLine(logline[0], logline[1], logline[2], logline[4],\
+ float(logline[6]), float(logline[8]), float(logline[10]), logline[12])
+ return log
+
+def getLastTwoLines(filename, logdir):
+ """Just a call to tail to get the diff between the last two runs"""
+ try:
+ line1, line2 = os.popen("tail -n2 " + logdir + '/' + filename)
+ except ValueError: #Check for new tests
+ tempfile = open(logdir + '/' + filename)
+ line1 = tempfile.readline()
+ tempfile.close()
+ return (line1, '\n')
+ return (line1, line2)
diff --git a/contrib/moses-speedtest/testsuite_config b/contrib/moses-speedtest/testsuite_config
new file mode 100644
index 000000000..b6ad6181c
--- /dev/null
+++ b/contrib/moses-speedtest/testsuite_config
@@ -0,0 +1,5 @@
+MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
+DROP_CACHES_COMM: sys_drop_caches 3
+TEST_DIR: /home/moses-speedtest/phrase_tables/tests
+TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
+BASEBRANCH: RELEASE-2.1.1 \ No newline at end of file
diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/CreateOnDiskPt/.cproject
index 86dfbac5b..e5082178a 100644
--- a/contrib/other-builds/moses-chart-cmd/.cproject
+++ b/contrib/other-builds/CreateOnDiskPt/.cproject
@@ -1,101 +1,76 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.162355801">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.162355801" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.602770742">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.602770742" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.162355801" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
- <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
- <tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
- <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1584931166" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.65842083" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
- <listOptionValue builtIn="false" value="/opt/local/include/"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../cmph/include&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.602770742" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.602770742." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1436139469" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.622899770" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/CreateOnDiskPt}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1448999623" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.2139008298" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.2008193341" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.627728792" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1832148270" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.1681469807" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
- <option id="gnu.cpp.compiler.option.preprocessor.def.1785368241" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
- <listOptionValue builtIn="false" value="HAVE_BOOST"/>
- <listOptionValue builtIn="false" value="TRACE_ENABLE"/>
- <listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/>
- <listOptionValue builtIn="false" value="WITH_THREADS"/>
+ <option id="gnu.cpp.compiler.option.preprocessor.def.425758466" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="MAX_NUM_FACTORS=4"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1402496521" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.285185442" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.827478809" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.1840610682" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.exe.debug.option.debugging.level.1437095112" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.128236233" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.587301391" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.2116328611" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.2129089003" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1464765114" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.755343734" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
- <option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../nplm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../DALM/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../cmph/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../randlm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
- <listOptionValue builtIn="false" value="/opt/local/lib"/>
- </option>
- <option id="gnu.cpp.link.option.libs.1177721357" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.606542044" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.813817495" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.libs.1325292383" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <listOptionValue builtIn="false" value="OnDiskPt"/>
<listOptionValue builtIn="false" value="moses"/>
- <listOptionValue builtIn="false" value="irstlm"/>
- <listOptionValue builtIn="false" value="dstruct"/>
- <listOptionValue builtIn="false" value="dalm"/>
- <listOptionValue builtIn="false" value="flm"/>
- <listOptionValue builtIn="false" value="oolm"/>
- <listOptionValue builtIn="false" value="lattice"/>
- <listOptionValue builtIn="false" value="misc"/>
<listOptionValue builtIn="false" value="search"/>
- <listOptionValue builtIn="false" value="RandLM"/>
- <listOptionValue builtIn="false" value="OnDiskPt"/>
<listOptionValue builtIn="false" value="lm"/>
<listOptionValue builtIn="false" value="util"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
- <listOptionValue builtIn="false" value="boost_system-mt"/>
- <listOptionValue builtIn="false" value="boost_thread-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="boost_serialization"/>
+ <listOptionValue builtIn="false" value="boost_system"/>
+ <listOptionValue builtIn="false" value="boost_thread"/>
+ <listOptionValue builtIn="false" value="boost_filesystem"/>
<listOptionValue builtIn="false" value="pthread"/>
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="dl"/>
<listOptionValue builtIn="false" value="rt"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.128214028" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <option id="gnu.cpp.link.option.paths.815001500" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
+ </option>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2077999464" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1267270542" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.612723114" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1309273058" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1393504995" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -103,45 +78,44 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.516628324">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.516628324" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.168814843">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.168814843" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.516628324" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.exe.release.516628324." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1782680519" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
- <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.release.587667692" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
- <builder buildPath="${workspace_loc:/moses-chart-cmd/Release}" id="cdt.managedbuild.target.gnu.builder.exe.release.330540300" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
- <tool id="cdt.managedbuild.tool.gnu.archiver.base.1062976385" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1344864210" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
- <option id="gnu.cpp.compiler.exe.release.option.optimization.level.1422341509" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.exe.release.option.debugging.level.1573362644" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1937178483" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.168814843" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.168814843." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.844577457" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1635721038" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/CreateOnDiskPt}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.361379130" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.799410017" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1404799808" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.696270987" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.1052942304" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2139553528" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1116405938" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.32856289" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.exe.release.option.debugging.level.1235489953" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1583852187" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1633770352" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1936692829" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.2077864052" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1045097629" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.1007421110" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.195880914" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.518921609" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.455462639" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.868037913" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1997666824" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.330494310" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1407747418" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.552535001" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.546084937" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -151,32 +125,30 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="moses-chart-cmd.cdt.managedbuild.target.gnu.exe.532411209" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
+ <project id="CreateOnDiskPt.cdt.managedbuild.target.gnu.exe.348559778" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.516628324;cdt.managedbuild.config.gnu.exe.release.516628324.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1116405938;cdt.managedbuild.tool.gnu.c.compiler.input.1583852187">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.168814843;cdt.managedbuild.config.gnu.exe.release.168814843.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1404799808;cdt.managedbuild.tool.gnu.cpp.compiler.input.2139553528">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.162355801;cdt.managedbuild.config.gnu.exe.debug.162355801.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.827478809;cdt.managedbuild.tool.gnu.c.compiler.input.128236233">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.602770742;cdt.managedbuild.config.gnu.exe.debug.602770742.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.587301391;cdt.managedbuild.tool.gnu.c.compiler.input.1464765114">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.162355801;cdt.managedbuild.config.gnu.exe.debug.162355801.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480;cdt.managedbuild.tool.gnu.cpp.compiler.input.1402496521">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.168814843;cdt.managedbuild.config.gnu.exe.release.168814843.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1633770352;cdt.managedbuild.tool.gnu.c.compiler.input.1045097629">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.516628324;cdt.managedbuild.config.gnu.exe.release.516628324.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1344864210;cdt.managedbuild.tool.gnu.cpp.compiler.input.1937178483">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.602770742;cdt.managedbuild.config.gnu.exe.debug.602770742.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.2008193341;cdt.managedbuild.tool.gnu.cpp.compiler.input.285185442">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
- <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+ <resource resourceType="PROJECT" workspacePath="/CreateOnDiskPt"/>
</configuration>
<configuration configurationName="Debug">
- <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+ <resource resourceType="PROJECT" workspacePath="/CreateOnDiskPt"/>
</configuration>
</storageModule>
- <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
- <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>
diff --git a/contrib/other-builds/CreateOnDiskPt/.project b/contrib/other-builds/CreateOnDiskPt/.project
new file mode 100644
index 000000000..5bca3b8f2
--- /dev/null
+++ b/contrib/other-builds/CreateOnDiskPt/.project
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>CreateOnDiskPt</name>
+ <comment></comment>
+ <projects>
+ <project>lm</project>
+ <project>moses</project>
+ <project>OnDiskPt</project>
+ <project>search</project>
+ <project>util</project>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+ <triggers>clean,full,incremental,</triggers>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ <buildCommand>
+ <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+ <triggers>full,incremental,</triggers>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.cdt.core.cnature</nature>
+ <nature>org.eclipse.cdt.core.ccnature</nature>
+ <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+ <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+ </natures>
+ <linkedResources>
+ <link>
+ <name>Main.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/OnDiskPt/Main.cpp</locationURI>
+ </link>
+ <link>
+ <name>Main.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/OnDiskPt/Main.h</locationURI>
+ </link>
+ </linkedResources>
+</projectDescription>
diff --git a/contrib/other-builds/consolidate/.cproject b/contrib/other-builds/consolidate/.cproject
new file mode 100644
index 000000000..4593957dc
--- /dev/null
+++ b/contrib/other-builds/consolidate/.cproject
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+ <storageModule moduleId="org.eclipse.cdt.core.settings">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.2091728208">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.2091728208" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <externalSettings/>
+ <extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ </extensions>
+ </storageModule>
+ <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.2091728208" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.2091728208." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.69362991" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.641760346" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/consolidate}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1286696537" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1571215005" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1186248186" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1416850495" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.534201039" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/include&quot;"/>
+ </option>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1468157552" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.82249493" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.83105790" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.937329669" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.461173729" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1950007837" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.110628197" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.libs.1393924562" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <listOptionValue builtIn="false" value="moses"/>
+ <listOptionValue builtIn="false" value="search"/>
+ <listOptionValue builtIn="false" value="OnDiskPt"/>
+ <listOptionValue builtIn="false" value="lm"/>
+ <listOptionValue builtIn="false" value="util"/>
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="boost_serialization"/>
+ <listOptionValue builtIn="false" value="boost_system"/>
+ <listOptionValue builtIn="false" value="boost_thread"/>
+ <listOptionValue builtIn="false" value="boost_filesystem"/>
+ <listOptionValue builtIn="false" value="pthread"/>
+ <listOptionValue builtIn="false" value="z"/>
+ <listOptionValue builtIn="false" value="dl"/>
+ <listOptionValue builtIn="false" value="rt"/>
+ </option>
+ <option id="gnu.cpp.link.option.paths.1967422094" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
+ </option>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1093223502" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
+ <additionalInput kind="additionalinput" paths="$(LIBS)"/>
+ </inputType>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1334927727" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.197989377" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ </tool>
+ </toolChain>
+ </folderInfo>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.2091728208.911524129" name="PropertiesConsolidator.cpp" rcbsApplicability="disable" resourcePath="PropertiesConsolidator.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654.741737356">
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654.741737356" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654">
+ <option id="gnu.cpp.compiler.option.include.paths.858416673" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/include&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
+ </option>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2042647079" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ </tool>
+ </fileInfo>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+ </cconfiguration>
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.185559773">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.185559773" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <externalSettings/>
+ <extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ </extensions>
+ </storageModule>
+ <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.185559773" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.185559773." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.33298530" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1524270442" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/consolidate}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.1812036307" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1942293389" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.520681695" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.649091161" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.1279967053" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.624630717" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.233526141" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1882834640" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.1438334736" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1338220126" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.2105674082" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.1531731895" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.286541559" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
+ <additionalInput kind="additionalinput" paths="$(LIBS)"/>
+ </inputType>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.1075374533" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.231041028" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ </tool>
+ </toolChain>
+ </folderInfo>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+ </cconfiguration>
+ </storageModule>
+ <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+ <project id="consolidate.cdt.managedbuild.target.gnu.exe.1024637209" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
+ </storageModule>
+ <storageModule moduleId="scannerConfiguration">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.2091728208;cdt.managedbuild.config.gnu.exe.debug.2091728208.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654;cdt.managedbuild.tool.gnu.cpp.compiler.input.1468157552">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.2091728208;cdt.managedbuild.config.gnu.exe.debug.2091728208.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.82249493;cdt.managedbuild.tool.gnu.c.compiler.input.461173729">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.185559773;cdt.managedbuild.config.gnu.exe.release.185559773.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.233526141;cdt.managedbuild.tool.gnu.c.compiler.input.1338220126">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.185559773;cdt.managedbuild.config.gnu.exe.release.185559773.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.520681695;cdt.managedbuild.tool.gnu.cpp.compiler.input.624630717">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/consolidate"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/consolidate"/>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
+</cproject>
diff --git a/contrib/other-builds/extract-ordering/.project b/contrib/other-builds/consolidate/.project
index f95b064b7..db9a1fa8f 100644
--- a/contrib/other-builds/extract-ordering/.project
+++ b/contrib/other-builds/consolidate/.project
@@ -1,8 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
- <name>extract-ordering</name>
+ <name>consolidate</name>
<comment></comment>
<projects>
+ <project>lm</project>
+ <project>moses</project>
+ <project>OnDiskPt</project>
+ <project>search</project>
+ <project>util</project>
</projects>
<buildSpec>
<buildCommand>
@@ -46,19 +51,19 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
- <name>SentenceAlignment.cpp</name>
+ <name>PropertiesConsolidator.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/PropertiesConsolidator.cpp</locationURI>
</link>
<link>
- <name>SentenceAlignment.h</name>
+ <name>PropertiesConsolidator.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/PropertiesConsolidator.h</locationURI>
</link>
<link>
- <name>extract-ordering-main.cpp</name>
+ <name>consolidate-main.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ordering-main.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/consolidate-main.cpp</locationURI>
</link>
<link>
<name>tables-core.cpp</name>
diff --git a/contrib/other-builds/extract-ghkm/.cproject b/contrib/other-builds/extract-ghkm/.cproject
index 61ea19161..a567905ee 100644
--- a/contrib/other-builds/extract-ghkm/.cproject
+++ b/contrib/other-builds/extract-ghkm/.cproject
@@ -1,59 +1,54 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1410559002">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1410559002" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.1975272196">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1975272196" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1410559002" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1410559002." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.1035891586" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.242178856" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract-ghkm/Debug}" id="cdt.managedbuild.builder.gnu.cross.430400318" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.251687262" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.962699619" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.230503798" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.433137197" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.367822268" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.971749711" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.984190691" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.1374841264" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../phrase-extract&quot;"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.1975272196" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1975272196." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1513645956" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.621141597" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extract-ghkm}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1641243676" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.150240237" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.494510261" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.520735766" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.730994342" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.1461708548" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/include&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2075381818" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1669405610" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1026620601" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1419857560" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <option id="gnu.cpp.link.option.paths.668926503" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/lib64&quot;"/>
- </option>
- <option id="gnu.cpp.link.option.libs.2091468346" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
- <listOptionValue builtIn="false" value="boost_program_options-mt"/>
- <listOptionValue builtIn="false" value="boost_thread-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.849972124" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.154971011" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.600284918" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2129236570" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1041890522" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.674199351" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.libs.1221354875" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
<listOptionValue builtIn="false" value="z"/>
- <listOptionValue builtIn="false" value="bz2"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1684298294" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <option id="gnu.cpp.link.option.paths.1494157787" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
+ </option>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1468265945" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.320160974" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.2021657841" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1689419664" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.882941613" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.387904024" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -61,44 +56,44 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1825927494">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1825927494" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.1834059581">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1834059581" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1825927494" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1825927494." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.2000920404" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1106451881" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract-ghkm/Release}" id="cdt.managedbuild.builder.gnu.cross.727887705" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.819016498" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1057468997" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.1130475273" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.164617278" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1834059581" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.1834059581." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.154645030" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.483189041" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/extract-ghkm}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.882065438" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1816735709" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.788831102" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.1367749352" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.1361465069" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.162097682" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1312144641" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.406333630" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.1059243022" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1204977083" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.394449415" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.573463904" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.361552728" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.769108402" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1068655225" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1213865062" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.764325642" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.1636823200" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.1458872383" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.961080011" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.1299258961" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.896866692" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.276294580" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.1285290074" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1686210477" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -108,21 +103,21 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="extract-ghkm.cdt.managedbuild.target.gnu.cross.exe.1830080171" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
+ <project id="extract-ghkm.cdt.managedbuild.target.gnu.exe.283582370" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1825927494;cdt.managedbuild.config.gnu.cross.exe.release.1825927494.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1312144641;cdt.managedbuild.tool.gnu.cpp.compiler.input.1204977083">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1975272196;cdt.managedbuild.config.gnu.exe.debug.1975272196.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.494510261;cdt.managedbuild.tool.gnu.cpp.compiler.input.1669405610">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1410559002;cdt.managedbuild.config.gnu.cross.exe.debug.1410559002.;cdt.managedbuild.tool.gnu.cross.c.compiler.251687262;cdt.managedbuild.tool.gnu.c.compiler.input.433137197">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1834059581;cdt.managedbuild.config.gnu.exe.release.1834059581.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.394449415;cdt.managedbuild.tool.gnu.c.compiler.input.769108402">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1825927494;cdt.managedbuild.config.gnu.cross.exe.release.1825927494.;cdt.managedbuild.tool.gnu.cross.c.compiler.819016498;cdt.managedbuild.tool.gnu.c.compiler.input.164617278">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1975272196;cdt.managedbuild.config.gnu.exe.debug.1975272196.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.849972124;cdt.managedbuild.tool.gnu.c.compiler.input.2129236570">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1410559002;cdt.managedbuild.config.gnu.cross.exe.debug.1410559002.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.367822268;cdt.managedbuild.tool.gnu.cpp.compiler.input.2075381818">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1834059581;cdt.managedbuild.config.gnu.exe.release.1834059581.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.788831102;cdt.managedbuild.tool.gnu.cpp.compiler.input.162097682">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
@@ -134,4 +129,5 @@
<resource resourceType="PROJECT" workspacePath="/extract-ghkm"/>
</configuration>
</storageModule>
+ <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>
diff --git a/contrib/other-builds/extract-ghkm/.project b/contrib/other-builds/extract-ghkm/.project
index b7c40f069..d4c8fe860 100644
--- a/contrib/other-builds/extract-ghkm/.project
+++ b/contrib/other-builds/extract-ghkm/.project
@@ -36,39 +36,19 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Alignment.h</locationURI>
</link>
<link>
- <name>AlignmentGraph.cpp</name>
+ <name>Hole.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/AlignmentGraph.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/Hole.h</locationURI>
</link>
<link>
- <name>AlignmentGraph.h</name>
+ <name>HoleCollection.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/AlignmentGraph.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/HoleCollection.cpp</locationURI>
</link>
<link>
- <name>ComposedRule.cpp</name>
+ <name>HoleCollection.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ComposedRule.cpp</locationURI>
- </link>
- <link>
- <name>ComposedRule.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ComposedRule.h</locationURI>
- </link>
- <link>
- <name>Exception.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Exception.h</locationURI>
- </link>
- <link>
- <name>ExtractGHKM.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ExtractGHKM.cpp</locationURI>
- </link>
- <link>
- <name>ExtractGHKM.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ExtractGHKM.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/HoleCollection.h</locationURI>
</link>
<link>
<name>InputFileStream.cpp</name>
@@ -81,31 +61,6 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.h</locationURI>
</link>
<link>
- <name>Jamfile</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Jamfile</locationURI>
- </link>
- <link>
- <name>Main.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Main.cpp</locationURI>
- </link>
- <link>
- <name>Node.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Node.cpp</locationURI>
- </link>
- <link>
- <name>Node.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Node.h</locationURI>
- </link>
- <link>
- <name>Options.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Options.h</locationURI>
- </link>
- <link>
<name>OutputFileStream.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp</locationURI>
@@ -116,54 +71,34 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
- <name>ParseTree.cpp</name>
+ <name>PhraseOrientation.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ParseTree.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.cpp</locationURI>
</link>
<link>
- <name>ParseTree.h</name>
+ <name>PhraseOrientation.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ParseTree.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.h</locationURI>
</link>
<link>
- <name>ScfgRule.cpp</name>
+ <name>SentenceAlignment.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ScfgRule.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.cpp</locationURI>
</link>
<link>
- <name>ScfgRule.h</name>
+ <name>SentenceAlignment.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ScfgRule.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h</locationURI>
</link>
<link>
- <name>ScfgRuleWriter.cpp</name>
+ <name>SentenceAlignmentWithSyntax.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.cpp</locationURI>
</link>
<link>
- <name>ScfgRuleWriter.h</name>
+ <name>SentenceAlignmentWithSyntax.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/ScfgRuleWriter.h</locationURI>
- </link>
- <link>
- <name>Span.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Span.cpp</locationURI>
- </link>
- <link>
- <name>Span.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Span.h</locationURI>
- </link>
- <link>
- <name>Subgraph.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Subgraph.cpp</locationURI>
- </link>
- <link>
- <name>Subgraph.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Subgraph.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.h</locationURI>
</link>
<link>
<name>SyntaxTree.cpp</name>
@@ -186,14 +121,9 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/XmlTree.h</locationURI>
</link>
<link>
- <name>XmlTreeParser.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/XmlTreeParser.cpp</locationURI>
- </link>
- <link>
- <name>XmlTreeParser.h</name>
+ <name>extract-rules-main.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/XmlTreeParser.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-rules-main.cpp</locationURI>
</link>
<link>
<name>tables-core.cpp</name>
diff --git a/contrib/other-builds/extract-mixed-syntax/.cproject b/contrib/other-builds/extract-mixed-syntax/.cproject
index 1cc09dda3..f246b0c32 100644
--- a/contrib/other-builds/extract-mixed-syntax/.cproject
+++ b/contrib/other-builds/extract-mixed-syntax/.cproject
@@ -1,54 +1,59 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.1409305044">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1409305044" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.456080129" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.582801917" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract-mixed-syntax/Debug}" id="cdt.managedbuild.builder.gnu.cross.1220166455" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.2055012191" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.1768196213" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2007889843" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.855436310" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.506549229" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.1497326561" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.1409305044" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1409305044." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1388217813" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.933039924" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extract-mixed-syntax}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.48110463" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.98916974" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1188224255" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.391351501" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1590628643" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.968781133" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/include&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../..&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../phrase-extract&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1981472807" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.606353571" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.740521305" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <option id="gnu.cpp.link.option.libs.1946120010" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.902271411" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.736647824" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.2105683691" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1947641767" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.966210211" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1701471219" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.libs.1906832553" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <listOptionValue builtIn="false" value="util"/>
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="boost_program_options"/>
<listOptionValue builtIn="false" value="z"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
</option>
- <option id="gnu.cpp.link.option.paths.1563475751" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <option id="gnu.cpp.link.option.paths.1107413288" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.106010037" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1613608534" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.136661991" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.2112208574" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.172930211" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1191140458" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.257834788" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -56,44 +61,44 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.715007893">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.1529383679">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1529383679" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.715007893." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.99436307" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.801178939" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract-mixed-syntax/Release}" id="cdt.managedbuild.builder.gnu.cross.1999547547" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1481537766" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.1967527847" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.442342681" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1529383679" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.1529383679." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1048718406" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.456212753" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/extract-mixed-syntax}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.1570266419" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.577209301" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1943090599" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.1506916262" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.2132167444" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.619145487" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.1847950300" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.1130138972" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.2063838952" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.391536740" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.147725572" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1423330814" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.158429528" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.2020667840" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1372779734" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.1089231126" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.1386796864" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1793802493" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.371006952" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.1770045040" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.707592414" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.1864177991" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.2122644096" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -103,22 +108,34 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="extract-mixed-syntax.cdt.managedbuild.target.gnu.cross.exe.1868010260" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
+ <project id="extract-mixed-syntax.cdt.managedbuild.target.gnu.exe.1077520702" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038;cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1409305044;cdt.managedbuild.config.gnu.exe.debug.1409305044.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1188224255;cdt.managedbuild.tool.gnu.cpp.compiler.input.1981472807">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906;cdt.managedbuild.tool.gnu.c.compiler.input.442342681">
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.668933542;cdt.managedbuild.config.gnu.cross.exe.release.668933542.;cdt.managedbuild.tool.gnu.cross.c.compiler.1457475056;cdt.managedbuild.tool.gnu.c.compiler.input.90570918">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915;cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064">
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1529383679;cdt.managedbuild.config.gnu.exe.release.1529383679.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1943090599;cdt.managedbuild.tool.gnu.cpp.compiler.input.619145487">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.717781750;cdt.managedbuild.config.gnu.cross.exe.debug.717781750.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.370220943;cdt.managedbuild.tool.gnu.cpp.compiler.input.1392992841">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568;cdt.managedbuild.tool.gnu.c.compiler.input.2007889843">
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1409305044;cdt.managedbuild.config.gnu.exe.debug.1409305044.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.902271411;cdt.managedbuild.tool.gnu.c.compiler.input.1947641767">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.717781750;cdt.managedbuild.config.gnu.cross.exe.debug.717781750.;cdt.managedbuild.tool.gnu.cross.c.compiler.843537319;cdt.managedbuild.tool.gnu.c.compiler.input.1750960939">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.668933542;cdt.managedbuild.config.gnu.cross.exe.release.668933542.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.648756325;cdt.managedbuild.tool.gnu.cpp.compiler.input.1840233144">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1529383679;cdt.managedbuild.config.gnu.exe.release.1529383679.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.2063838952;cdt.managedbuild.tool.gnu.c.compiler.input.1423330814">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
@@ -129,5 +146,4 @@
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
</configuration>
</storageModule>
- <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>
diff --git a/contrib/other-builds/extract-mixed-syntax/.project b/contrib/other-builds/extract-mixed-syntax/.project
index 8f0f81f07..7fdbadabb 100644
--- a/contrib/other-builds/extract-mixed-syntax/.project
+++ b/contrib/other-builds/extract-mixed-syntax/.project
@@ -3,6 +3,7 @@
<name>extract-mixed-syntax</name>
<comment></comment>
<projects>
+ <project>util</project>
</projects>
<buildSpec>
<buildCommand>
@@ -24,4 +25,196 @@
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
+ <linkedResources>
+ <link>
+ <name>AlignedSentence.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/AlignedSentence.cpp</locationURI>
+ </link>
+ <link>
+ <name>AlignedSentence.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/AlignedSentence.h</locationURI>
+ </link>
+ <link>
+ <name>AlignedSentenceSyntax.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp</locationURI>
+ </link>
+ <link>
+ <name>AlignedSentenceSyntax.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h</locationURI>
+ </link>
+ <link>
+ <name>ConsistentPhrase.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp</locationURI>
+ </link>
+ <link>
+ <name>ConsistentPhrase.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/ConsistentPhrase.h</locationURI>
+ </link>
+ <link>
+ <name>ConsistentPhrases.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp</locationURI>
+ </link>
+ <link>
+ <name>ConsistentPhrases.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h</locationURI>
+ </link>
+ <link>
+ <name>InputFileStream.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/InputFileStream.cpp</locationURI>
+ </link>
+ <link>
+ <name>InputFileStream.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/InputFileStream.h</locationURI>
+ </link>
+ <link>
+ <name>Jamfile</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Jamfile</locationURI>
+ </link>
+ <link>
+ <name>Main.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Main.cpp</locationURI>
+ </link>
+ <link>
+ <name>Main.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Main.h</locationURI>
+ </link>
+ <link>
+ <name>Makefile</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Makefile</locationURI>
+ </link>
+ <link>
+ <name>NonTerm.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/NonTerm.cpp</locationURI>
+ </link>
+ <link>
+ <name>NonTerm.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/NonTerm.h</locationURI>
+ </link>
+ <link>
+ <name>OutputFileStream.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp</locationURI>
+ </link>
+ <link>
+ <name>OutputFileStream.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
+ </link>
+ <link>
+ <name>Parameter.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Parameter.cpp</locationURI>
+ </link>
+ <link>
+ <name>Parameter.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Parameter.h</locationURI>
+ </link>
+ <link>
+ <name>Phrase.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Phrase.cpp</locationURI>
+ </link>
+ <link>
+ <name>Phrase.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Phrase.h</locationURI>
+ </link>
+ <link>
+ <name>Rule.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Rule.cpp</locationURI>
+ </link>
+ <link>
+ <name>Rule.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Rule.h</locationURI>
+ </link>
+ <link>
+ <name>RulePhrase.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/RulePhrase.cpp</locationURI>
+ </link>
+ <link>
+ <name>RulePhrase.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/RulePhrase.h</locationURI>
+ </link>
+ <link>
+ <name>RuleSymbol.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/RuleSymbol.cpp</locationURI>
+ </link>
+ <link>
+ <name>RuleSymbol.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/RuleSymbol.h</locationURI>
+ </link>
+ <link>
+ <name>Rules.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Rules.cpp</locationURI>
+ </link>
+ <link>
+ <name>Rules.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Rules.h</locationURI>
+ </link>
+ <link>
+ <name>SyntaxTree.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/SyntaxTree.cpp</locationURI>
+ </link>
+ <link>
+ <name>SyntaxTree.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/SyntaxTree.h</locationURI>
+ </link>
+ <link>
+ <name>Word.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Word.cpp</locationURI>
+ </link>
+ <link>
+ <name>Word.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/Word.h</locationURI>
+ </link>
+ <link>
+ <name>gzfilebuf.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/gzfilebuf.h</locationURI>
+ </link>
+ <link>
+ <name>pugiconfig.hpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/pugiconfig.hpp</locationURI>
+ </link>
+ <link>
+ <name>pugixml.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/pugixml.cpp</locationURI>
+ </link>
+ <link>
+ <name>pugixml.hpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-mixed-syntax/pugixml.hpp</locationURI>
+ </link>
+ </linkedResources>
</projectDescription>
diff --git a/contrib/other-builds/extract-mixed-syntax/Global.cpp b/contrib/other-builds/extract-mixed-syntax/Global.cpp
deleted file mode 100644
index 27aeb4b95..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Global.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Global.cpp
- * extract
- *
- * Created by Hieu Hoang on 01/02/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-
-#include "Global.h"
-
-bool g_debug = false;
-
-Global::Global()
-: minHoleSpanSourceDefault(2)
-, maxHoleSpanSourceDefault(7)
-, minHoleSpanSourceSyntax(1)
-, maxHoleSpanSourceSyntax(1000)
-, maxUnaligned(5)
-
-, maxSymbols(5)
-, maxNonTerm(3)
-, maxNonTermDefault(2)
-
-// int minHoleSize(1)
-// int minSubPhraseSize(1) // minimum size of a remaining lexical phrase
-, glueGrammarFlag(false)
-, unknownWordLabelFlag(false)
-//bool zipFiles(false)
-, sourceSyntax(true)
-, targetSyntax(false)
-, mixed(true)
-, uppermostOnly(true)
-, allowDefaultNonTermEdge(true)
-, gzOutput(false)
-
-{}
diff --git a/contrib/other-builds/extract-mixed-syntax/Global.h b/contrib/other-builds/extract-mixed-syntax/Global.h
deleted file mode 100644
index 41cdbf0ce..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Global.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-/*
- * Global.h
- * extract
- *
- * Created by Hieu Hoang on 01/02/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <set>
-#include <map>
-#include <string>
-
-class Global
-{
-public:
- int minHoleSpanSourceDefault;
- int maxHoleSpanSourceDefault;
- int minHoleSpanSourceSyntax;
- int maxHoleSpanSourceSyntax;
-
- int maxSymbols;
- bool glueGrammarFlag;
- bool unknownWordLabelFlag;
- int maxNonTerm;
- int maxNonTermDefault;
- bool sourceSyntax;
- bool targetSyntax;
- bool mixed;
- int maxUnaligned;
- bool uppermostOnly;
- bool allowDefaultNonTermEdge;
- bool gzOutput;
-
- Global();
-
- Global(const Global&);
-
-};
-
-extern bool g_debug;
-
-#define DEBUG_OUTPUT() void DebugOutput() const;
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Lattice.cpp b/contrib/other-builds/extract-mixed-syntax/Lattice.cpp
deleted file mode 100644
index 2b9ebac6e..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Lattice.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Lattice.cpp
- * extract
- *
- * Created by Hieu Hoang on 18/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-
-#include <cassert>
-#include "Lattice.h"
-#include "LatticeNode.h"
-#include "Tunnel.h"
-#include "TunnelCollection.h"
-#include "SyntaxTree.h"
-#include "SentenceAlignment.h"
-#include "tables-core.h"
-#include "Rule.h"
-#include "RuleCollection.h"
-
-using namespace std;
-
-Lattice::Lattice(size_t sourceSize)
-:m_stacks(sourceSize + 1)
-{
-}
-
-Lattice::~Lattice()
-{
- std::vector<Stack>::iterator iterStack;
- for (iterStack = m_stacks.begin(); iterStack != m_stacks.end(); ++iterStack)
- {
- Stack &stack = *iterStack;
- RemoveAllInColl(stack);
- }
-}
-
-void Lattice::CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global)
-{
- // term
- Stack &startStack = GetStack(startPos);
-
- LatticeNode *node = new LatticeNode(startPos, &sentence);
- startStack.push_back(node);
-
- // non-term
- for (size_t endPos = startPos + 1; endPos <= sentence.source.size(); ++endPos)
- {
- const TunnelList &tunnels = tunnelColl.GetTunnels(startPos, endPos - 1);
-
- TunnelList::const_iterator iterHole;
- for (iterHole = tunnels.begin(); iterHole != tunnels.end(); ++iterHole)
- {
- const Tunnel &tunnel = *iterHole;
- CreateArcsUsing1Hole(tunnel, sentence, global);
- }
- }
-}
-
-void Lattice::CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global)
-{
- size_t startPos = tunnel.GetRange(0).GetStartPos()
- , endPos = tunnel.GetRange(0).GetEndPos();
- size_t numSymbols = tunnel.GetRange(0).GetWidth();
- assert(numSymbols > 0);
-
- Stack &startStack = GetStack(startPos);
-
-
- // non-terms. cartesian product of source & target labels
- assert(startPos == tunnel.GetRange(0).GetStartPos() && endPos == tunnel.GetRange(0).GetEndPos());
- size_t startT = tunnel.GetRange(1).GetStartPos()
- ,endT = tunnel.GetRange(1).GetEndPos();
-
- const SyntaxNodes &nodesS = sentence.sourceTree.GetNodes(startPos, endPos);
- const SyntaxNodes &nodesT = sentence.targetTree.GetNodes(startT, endT );
-
- SyntaxNodes::const_iterator iterS, iterT;
- for (iterS = nodesS.begin(); iterS != nodesS.end(); ++iterS)
- {
- const SyntaxNode *syntaxNodeS = *iterS;
-
- for (iterT = nodesT.begin(); iterT != nodesT.end(); ++iterT)
- {
- const SyntaxNode *syntaxNodeT = *iterT;
-
- bool isSyntax = syntaxNodeS->IsSyntax() || syntaxNodeT->IsSyntax();
- size_t maxSourceNonTermSpan = isSyntax ? global.maxHoleSpanSourceSyntax : global.maxHoleSpanSourceDefault;
-
- if (maxSourceNonTermSpan >= endPos - startPos)
- {
- LatticeNode *node = new LatticeNode(tunnel, syntaxNodeS, syntaxNodeT);
- startStack.push_back(node);
- }
- }
- }
-}
-
-Stack &Lattice::GetStack(size_t startPos)
-{
- assert(startPos < m_stacks.size());
- return m_stacks[startPos];
-}
-
-const Stack &Lattice::GetStack(size_t startPos) const
-{
- assert(startPos < m_stacks.size());
- return m_stacks[startPos];
-}
-
-void Lattice::CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global)
-{
- const Stack &startStack = GetStack(startPos);
-
- Stack::const_iterator iterStack;
- for (iterStack = startStack.begin(); iterStack != startStack.end(); ++iterStack)
- {
- const LatticeNode *node = *iterStack;
- Rule *initRule = new Rule(node);
-
- if (initRule->CanRecurse(global, sentence.GetTunnelCollection()))
- { // may or maynot be valid, but can continue to build on this rule
- initRule->CreateRules(m_rules, *this, sentence, global);
- }
-
- if (initRule->IsValid(global, sentence.GetTunnelCollection()))
- { // add to rule collection
- m_rules.Add(global, initRule, sentence);
- }
- else
- {
- delete initRule;
- }
-
-
- }
-}
-
-Stack Lattice::GetNonTermNode(const Range &sourceRange) const
-{
- Stack ret;
- size_t sourcePos = sourceRange.GetStartPos();
-
- const Stack &origStack = GetStack(sourcePos);
- Stack::const_iterator iter;
- for (iter = origStack.begin(); iter != origStack.end(); ++iter)
- {
- LatticeNode *node = *iter;
- const Range &nodeRangeS = node->GetSourceRange();
-
- assert(nodeRangeS.GetStartPos() == sourceRange.GetStartPos());
-
- if (! node->IsTerminal() && nodeRangeS.GetEndPos() == sourceRange.GetEndPos())
- {
- ret.push_back(node);
- }
- }
-
- return ret;
-}
-
-std::ostream& operator<<(std::ostream &out, const Lattice &obj)
-{
- std::vector<Stack>::const_iterator iter;
- for (iter = obj.m_stacks.begin(); iter != obj.m_stacks.end(); ++iter)
- {
- const Stack &stack = *iter;
-
- Stack::const_iterator iterStack;
- for (iterStack = stack.begin(); iterStack != stack.end(); ++iterStack)
- {
- const LatticeNode &node = **iterStack;
- out << node << " ";
- }
- }
-
- return out;
-}
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Lattice.h b/contrib/other-builds/extract-mixed-syntax/Lattice.h
deleted file mode 100644
index c88aa0844..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Lattice.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#pragma once
-/*
- * Lattice.h
- * extract
- *
- * Created by Hieu Hoang on 18/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <iostream>
-#include <vector>
-#include "RuleCollection.h"
-
-class Global;
-class LatticeNode;
-class Tunnel;
-class TunnelCollection;
-class SentenceAlignment;
-
-typedef std::vector<LatticeNode*> Stack;
-
-class Lattice
-{
- friend std::ostream& operator<<(std::ostream&, const Lattice&);
-
- std::vector<Stack> m_stacks;
- RuleCollection m_rules;
-
- Stack &GetStack(size_t endPos);
-
- void CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global);
-
-public:
- Lattice(size_t sourceSize);
- ~Lattice();
-
- void CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global);
- void CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global);
-
- const Stack &GetStack(size_t startPos) const;
- const RuleCollection &GetRules() const
- { return m_rules; }
-
- Stack GetNonTermNode(const Range &sourceRange) const;
-
-};
-
diff --git a/contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp b/contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp
deleted file mode 100644
index 8f0cbfc0f..000000000
--- a/contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * LatticeNode.cpp
- * extract
- *
- * Created by Hieu Hoang on 18/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <sstream>
-#include "LatticeNode.h"
-#include "SyntaxTree.h"
-#include "Tunnel.h"
-#include "SentenceAlignment.h"
-#include "SymbolSequence.h"
-
-size_t LatticeNode::s_count = 0;
-
-using namespace std;
-
-// for terms
-LatticeNode::LatticeNode(size_t pos, const SentenceAlignment *sentence)
-:m_tunnel(NULL)
-,m_isTerminal(true)
-,m_sourceTreeNode(NULL)
-,m_targetTreeNode(NULL)
-,m_sentence(sentence)
-,m_sourceRange(pos, pos)
-{
- s_count++;
- //cerr << *this << endl;
-}
-
-// for non-terms
-LatticeNode::LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode)
-:m_tunnel(&tunnel)
-,m_isTerminal(false)
-,m_sourceTreeNode(sourceTreeNode)
-,m_targetTreeNode(targetTreeNode)
-,m_sentence(NULL)
-,m_sourceRange(tunnel.GetRange(0))
-{
- s_count++;
- //cerr << *this << endl;
-}
-
-bool LatticeNode::IsSyntax() const
-{
- assert(!m_isTerminal);
- bool ret = m_sourceTreeNode->IsSyntax() || m_targetTreeNode->IsSyntax();
- return ret;
-}
-
-size_t LatticeNode::GetNumSymbols(size_t direction) const
-{
- return 1;
-}
-
-int LatticeNode::Compare(const LatticeNode &otherNode) const
-{
- int ret = 0;
- if (m_isTerminal != otherNode.m_isTerminal)
- {
- ret = m_isTerminal ? -1 : 1;
- }
-
- // both term or non-term
- else if (m_isTerminal)
- { // term. compare source span
- if (m_sourceRange.GetStartPos() == otherNode.m_sourceRange.GetStartPos())
- ret = 0;
- else
- ret = (m_sourceRange.GetStartPos() < otherNode.m_sourceRange.GetStartPos()) ? -1 : +1;
- }
- else
- { // non-term. compare source span and BOTH label
- assert(!m_isTerminal);
- assert(!otherNode.m_isTerminal);
-
- if (m_sourceTreeNode->IsSyntax())
- {
- ret = m_tunnel->Compare(*otherNode.m_tunnel, 0);
- if (ret == 0 && m_sourceTreeNode->GetLabel() != otherNode.m_sourceTreeNode->GetLabel())
- {
- ret = (m_sourceTreeNode->GetLabel() < otherNode.m_sourceTreeNode->GetLabel()) ? -1 : +1;
- }
- }
-
- if (ret == 0 && m_targetTreeNode->IsSyntax())
- {
- ret = m_tunnel->Compare(*otherNode.m_tunnel, 1);
- if (ret == 0 && m_targetTreeNode->GetLabel() != otherNode.m_targetTreeNode->GetLabel())
- {
- ret = (m_targetTreeNode->GetLabel() < otherNode.m_targetTreeNode->GetLabel()) ? -1 : +1;
- }
- }
- }
-
- return ret;
-}
-
-void LatticeNode::CreateSymbols(size_t direction, SymbolSequence &symbols) const
-{
- if (m_isTerminal)
- {
- /*
- const std::vector<std::string> &words = (direction == 0 ? m_sentence->source : m_sentence->target);
- size_t startPos = m_tunnel.GetStart(direction)
- ,endPos = m_tunnel.GetEnd(direction);
-
- for (size_t pos = startPos; pos <= endPos; ++pos)
- {
- Symbol symbol(words[pos], pos);
- symbols.Add(symbol);
- }
- */
- }
- else
- { // output both
-
- Symbol symbol(m_sourceTreeNode->GetLabel(), m_targetTreeNode->GetLabel()
- , m_tunnel->GetRange(0).GetStartPos(), m_tunnel->GetRange(0).GetEndPos()
- , m_tunnel->GetRange(1).GetStartPos(), m_tunnel->GetRange(1).GetEndPos()
- , m_sourceTreeNode->IsSyntax(), m_targetTreeNode->IsSyntax());
-
- symbols.Add(symbol);
- }
-
-}
-
-std::ostream& operator<<(std::ostream &out, const LatticeNode &obj)
-{
- if (obj.m_isTerminal)
- {
- assert(obj.m_sourceRange.GetWidth() == 1);
- size_t pos = obj.m_sourceRange.GetStartPos();
-
- const SentenceAlignment &sentence = *obj.m_sentence;
- out << obj.m_sourceRange << "=" << sentence.source[pos];
- }
- else
- {
- assert(obj.m_tunnel);
- out << obj.GetTunnel() << "=" << obj.m_sourceTreeNode->GetLabel() << obj.m_targetTreeNode->GetLabel() << " ";
- }
-
- return out;
-}
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/LatticeNode.h b/contrib/other-builds/extract-mixed-syntax/LatticeNode.h
deleted file mode 100644
index 73ea6a224..000000000
--- a/contrib/other-builds/extract-mixed-syntax/LatticeNode.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#pragma once
-/*
- * LatticeNode.h
- * extract
- *
- * Created by Hieu Hoang on 18/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <vector>
-#include <iostream>
-#include <cassert>
-#include "Range.h"
-
-class Tunnel;
-class SyntaxNode;
-class SentenceAlignment;
-class SymbolSequence;
-
-class LatticeNode
-{
- friend std::ostream& operator<<(std::ostream&, const LatticeNode&);
-
- bool m_isTerminal;
-
- // for terms & non-term
- Range m_sourceRange;
-
- // non-terms. source range should be same as m_sourceRange
- const Tunnel *m_tunnel;
-
-public:
- static size_t s_count;
-
-
-
- const SyntaxNode *m_sourceTreeNode, *m_targetTreeNode;
- const SentenceAlignment *m_sentence;
-
- // for terms
- LatticeNode(size_t pos, const SentenceAlignment *sentence);
-
- // for non-terms
- LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode);
-
- bool IsTerminal() const
- { return m_isTerminal; }
-
- bool IsSyntax() const;
-
- size_t GetNumSymbols(size_t direction) const;
-
- std::string ToString() const;
-
- int Compare(const LatticeNode &otherNode) const;
-
- void CreateSymbols(size_t direction, SymbolSequence &symbols) const;
-
- const Tunnel &GetTunnel() const
- {
- assert(m_tunnel);
- return *m_tunnel;
- }
-
- const Range &GetSourceRange() const
- {
- return m_sourceRange;
- }
- const SyntaxNode &GetSyntaxNode(size_t direction) const
- {
- const SyntaxNode *node = direction == 0 ? m_sourceTreeNode : m_targetTreeNode;
- assert(node);
- return *node;
- }
-
-};
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Makefile b/contrib/other-builds/extract-mixed-syntax/Makefile
deleted file mode 100644
index b992b161f..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-all: extract
-
-clean:
- rm -f *.o extract-mixed-syntax
-
-.cpp.o:
- g++ -O6 -g -c $<
-
-extract: tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o
-
- g++ tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o -lz -lboost_iostreams-mt -o extract-mixed-syntax
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/OutputFileStream.cpp b/contrib/other-builds/extract-mixed-syntax/OutputFileStream.cpp
deleted file mode 100644
index a61ce1ab1..000000000
--- a/contrib/other-builds/extract-mixed-syntax/OutputFileStream.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2006 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#include <boost/iostreams/filter/gzip.hpp>
-#include "OutputFileStream.h"
-#include "gzfilebuf.h"
-
-using namespace std;
-
-namespace Moses
-{
-OutputFileStream::OutputFileStream()
- :boost::iostreams::filtering_ostream()
- ,m_outFile(NULL)
-{
-}
-
-OutputFileStream::OutputFileStream(const std::string &filePath)
- : m_outFile(NULL)
-{
- Open(filePath);
-}
-
-OutputFileStream::~OutputFileStream()
-{
- Close();
-}
-
-bool OutputFileStream::Open(const std::string &filePath)
-{
- m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
- if (m_outFile->fail()) {
- return false;
- }
-
- if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
- this->push(boost::iostreams::gzip_compressor());
- }
- this->push(*m_outFile);
-
- return true;
-}
-
-void OutputFileStream::Close()
-{
- if (m_outFile == NULL) {
- return;
- }
-
- this->flush();
- this->pop(); // file
-
- m_outFile->close();
- delete m_outFile;
- m_outFile = NULL;
- return;
-}
-
-
-}
-
diff --git a/contrib/other-builds/extract-mixed-syntax/OutputFileStream.h b/contrib/other-builds/extract-mixed-syntax/OutputFileStream.h
deleted file mode 100644
index f52e36d76..000000000
--- a/contrib/other-builds/extract-mixed-syntax/OutputFileStream.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2006 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#pragma once
-
-#include <cstdlib>
-#include <fstream>
-#include <string>
-#include <iostream>
-#include <boost/iostreams/filtering_stream.hpp>
-
-namespace Moses
-{
-
-/** Used in place of std::istream, can read zipped files if it ends in .gz
- */
-class OutputFileStream : public boost::iostreams::filtering_ostream
-{
-protected:
- std::ofstream *m_outFile;
-public:
- OutputFileStream();
-
- OutputFileStream(const std::string &filePath);
- virtual ~OutputFileStream();
-
- bool Open(const std::string &filePath);
- void Close();
-};
-
-}
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Range.cpp b/contrib/other-builds/extract-mixed-syntax/Range.cpp
deleted file mode 100644
index a98ac278b..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Range.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Range.cpp
- * extract
- *
- * Created by Hieu Hoang on 22/02/2011.
- * Copyright 2011 __MyCompanyName__. All rights reserved.
- *
- */
-
-#include "Range.h"
-
-using namespace std;
-
-void Range::Merge(const Range &a, const Range &b)
-{
- if (a.m_startPos == NOT_FOUND)
- { // get the other regardless
- m_startPos = b.m_startPos;
- }
- else if (b.m_startPos == NOT_FOUND)
- {
- m_startPos = a.m_startPos;
- }
- else
- {
- m_startPos = min(a.m_startPos, b.m_startPos);
- }
-
- if (a.m_endPos == NOT_FOUND)
- { // get the other regardless
- m_endPos = b.m_endPos;
- }
- else if (b.m_endPos == NOT_FOUND)
- { // do nothing
- m_endPos = a.m_endPos;
- }
- else
- {
- m_endPos = max(a.m_endPos, b.m_endPos);
- }
-
-
-}
-
-int Range::Compare(const Range &other) const
-{
- if (m_startPos < other.m_startPos)
- return -1;
- else if (m_startPos > other.m_startPos)
- return +1;
- else if (m_endPos < other.m_endPos)
- return -1;
- else if (m_endPos > other.m_endPos)
- return +1;
-
- return 0;
-
-}
-
-bool Range::Overlap(const Range &other) const
-{
- if ( other.m_endPos < m_startPos || other.m_startPos > m_endPos)
- return false;
-
- return true;
-}
-
-std::ostream& operator<<(std::ostream &out, const Range &range)
-{
- out << "[" << range.m_startPos << "-" << range.m_endPos << "]";
- return out;
-}
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Range.h b/contrib/other-builds/extract-mixed-syntax/Range.h
deleted file mode 100644
index 05d0c97c9..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Range.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Range.h
- * extract
- *
- * Created by Hieu Hoang on 22/02/2011.
- * Copyright 2011 __MyCompanyName__. All rights reserved.
- *
- */
-#pragma once
-#include <string>
-#include <iostream>
-#include <limits>
-
-#define NOT_FOUND std::numeric_limits<size_t>::max()
-
-class Range
-{
- friend std::ostream& operator<<(std::ostream&, const Range&);
-
- size_t m_startPos, m_endPos;
-public:
-
- Range()
- :m_startPos(NOT_FOUND)
- ,m_endPos(NOT_FOUND)
- {}
-
- Range(const Range &copy)
- :m_startPos(copy.m_startPos)
- ,m_endPos(copy.m_endPos)
- {}
-
- Range(size_t startPos, size_t endPos)
- :m_startPos(startPos)
- ,m_endPos(endPos)
- {}
-
- size_t GetStartPos() const
- { return m_startPos; }
- size_t GetEndPos() const
- { return m_endPos; }
- size_t GetWidth() const
- { return m_endPos - m_startPos + 1; }
-
- void SetStartPos(size_t startPos)
- { m_startPos = startPos; }
- void SetEndPos(size_t endPos)
- { m_endPos = endPos; }
-
- void Merge(const Range &a, const Range &b);
-
- int Compare(const Range &other) const;
-
- bool Overlap(const Range &other) const;
-
-
-};
diff --git a/contrib/other-builds/extract-mixed-syntax/Rule.cpp b/contrib/other-builds/extract-mixed-syntax/Rule.cpp
deleted file mode 100644
index 7cc7d3a6f..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Rule.cpp
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * Rule.cpp
- * extract
- *
- * Created by Hieu Hoang on 19/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <algorithm>
-#include <sstream>
-#include "Rule.h"
-#include "Global.h"
-#include "LatticeNode.h"
-#include "Lattice.h"
-#include "SentenceAlignment.h"
-#include "Tunnel.h"
-#include "TunnelCollection.h"
-#include "RuleCollection.h"
-
-using namespace std;
-
-RuleElement::RuleElement(const RuleElement &copy)
-:m_latticeNode(copy.m_latticeNode)
-,m_alignmentPos(copy.m_alignmentPos)
-{
-}
-
-
-Rule::Rule(const LatticeNode *latticeNode)
-:m_lhs(NULL)
-{
- RuleElement element(*latticeNode);
-
- m_coll.push_back(element);
-}
-
-Rule::Rule(const Rule &prevRule, const LatticeNode *latticeNode)
-:m_coll(prevRule.m_coll)
-,m_lhs(NULL)
-{
- RuleElement element(*latticeNode);
- m_coll.push_back(element);
-}
-
-Rule::Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence)
-:m_coll(copy.m_coll)
-,m_source(copy.m_source)
-,m_target(copy.m_target)
-,m_lhs(lhs)
-{
- CreateSymbols(global, isValid, sentence);
-}
-
-Rule::~Rule()
-{
-}
-
-// helper for sort
-struct CompareLatticeNodeTarget
-{
- bool operator() (const RuleElement *a, const RuleElement *b)
- {
- const Range &rangeA = a->GetLatticeNode().GetTunnel().GetRange(1)
- ,&rangeB = b->GetLatticeNode().GetTunnel().GetRange(1);
- return rangeA.GetEndPos() < rangeB.GetEndPos();
- }
-};
-
-void Rule::CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence)
-{
- vector<RuleElement*> nonTerms;
-
- // source
- for (size_t ind = 0; ind < m_coll.size(); ++ind)
- {
- RuleElement &element = m_coll[ind];
- const LatticeNode &node = element.GetLatticeNode();
- if (node.IsTerminal())
- {
- size_t sourcePos = node.GetSourceRange().GetStartPos();
- const string &word = sentence.source[sourcePos];
- Symbol symbol(word, sourcePos);
- m_source.Add(symbol);
- }
- else
- { // non-term
- const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
- const string &targetWord = node.GetSyntaxNode(1).GetLabel();
- Symbol symbol(sourceWord, targetWord
- , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
- , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
- , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
- m_source.Add(symbol);
-
- // store current pos within phrase
- element.m_alignmentPos.first = ind;
-
- // for target symbols
- nonTerms.push_back(&element);
- }
-
- }
-
- // target
- isValid = true;
-
- const Range &lhsTargetRange = m_lhs->GetTunnel().GetRange(1);
-
- // check spans of target non-terms
- if (nonTerms.size())
- {
- // sort non-term rules elements by target range
- std::sort(nonTerms.begin(), nonTerms.end(), CompareLatticeNodeTarget());
-
- const Range &first = nonTerms.front()->GetLatticeNode().GetTunnel().GetRange(1);
- const Range &last = nonTerms.back()->GetLatticeNode().GetTunnel().GetRange(1);
-
- if (first.GetStartPos() < lhsTargetRange.GetStartPos()
- || last.GetEndPos() > lhsTargetRange.GetEndPos())
- {
- isValid = false;
- }
- }
-
- if (isValid)
- {
- size_t indNonTerm = 0;
- RuleElement *currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
- for (size_t targetPos = lhsTargetRange.GetStartPos(); targetPos <= lhsTargetRange.GetEndPos(); ++targetPos)
- {
- if (currNonTermElement && targetPos == currNonTermElement->GetLatticeNode().GetTunnel().GetRange(1).GetStartPos())
- { // start of a non-term. print out non-terms & skip to the end
-
- const LatticeNode &node = currNonTermElement->GetLatticeNode();
-
- const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
- const string &targetWord = node.GetSyntaxNode(1).GetLabel();
- Symbol symbol(sourceWord, targetWord
- , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
- , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
- , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
- m_target.Add(symbol);
-
- // store current pos within phrase
- currNonTermElement->m_alignmentPos.second = m_target.GetSize() - 1;
-
- assert(currNonTermElement->m_alignmentPos.first != NOT_FOUND);
-
- targetPos = node.GetTunnel().GetRange(1).GetEndPos();
- indNonTerm++;
- currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
- }
- else
- { // term
- const string &word = sentence.target[targetPos];
-
- Symbol symbol(word, targetPos);
- m_target.Add(symbol);
-
- }
- }
-
- assert(indNonTerm == nonTerms.size());
-
- if (m_target.GetSize() > global.maxSymbols) {
- isValid = false;
- //cerr << "m_source=" << m_source.GetSize() << ":" << m_source << endl;
- //cerr << "m_target=" << m_target.GetSize() << ":" << m_target << endl;
- }
- }
-}
-
-bool Rule::MoreDefaultNonTermThanTerm() const
-{
- size_t numTerm = 0, numDefaultNonTerm = 0;
-
- CollType::const_iterator iter;
- for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
- {
- const RuleElement &element = *iter;
- const LatticeNode &node = element.GetLatticeNode();
- if (node.IsTerminal())
- {
- ++numTerm;
- }
- else if (!node.IsSyntax())
- {
- ++numDefaultNonTerm;
- }
- }
-
- bool ret = numDefaultNonTerm > numTerm;
- return ret;
-}
-
-bool Rule::SourceHasEdgeDefaultNonTerm() const
-{
- assert(m_coll.size());
- const LatticeNode &first = m_coll.front().GetLatticeNode();
- const LatticeNode &last = m_coll.back().GetLatticeNode();
-
- // 1st
- if (!first.IsTerminal() && !first.IsSyntax())
- {
- return true;
- }
- if (!last.IsTerminal() && !last.IsSyntax())
- {
- return true;
- }
-
- return false;
-}
-
-bool Rule::IsValid(const Global &global, const TunnelCollection &tunnelColl) const
-{
- if (m_coll.size() == 1 && !m_coll[0].GetLatticeNode().IsTerminal()) // can't be only 1 terminal
- {
- return false;
- }
-
- if (MoreDefaultNonTermThanTerm())
- { // must have at least as many terms as non-syntax non-terms
- return false;
- }
-
- if (!global.allowDefaultNonTermEdge && SourceHasEdgeDefaultNonTerm())
- {
- return false;
- }
-
- if (GetNumSymbols() > global.maxSymbols)
- {
- return false;
- }
-
- if (AdjacentDefaultNonTerms())
- {
- return false;
- }
-
- if (!IsHole(tunnelColl))
- {
- return false;
- }
-
- if (NonTermOverlap())
- {
- return false;
- }
-
- /*
- std::pair<size_t, size_t> spanS = GetSpan(0)
- ,spanT= GetSpan(1);
-
- if (tunnelColl.NumUnalignedWord(0, spanS.first, spanS.second) >= global.maxUnaligned)
- return false;
- if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
- return false;
- */
-
- return true;
-}
-
-bool Rule::NonTermOverlap() const
-{
- vector<Range> ranges;
-
- CollType::const_iterator iter;
- for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
- {
- const RuleElement &element = *iter;
- if (!element.GetLatticeNode().IsTerminal())
- {
- const Range &range = element.GetLatticeNode().GetTunnel().GetRange(1);
- ranges.push_back(range);
- }
- }
-
- vector<Range>::const_iterator outerIter;
- for (outerIter = ranges.begin(); outerIter != ranges.end(); ++outerIter)
- {
- const Range &outer = *outerIter;
- vector<Range>::const_iterator innerIter;
- for (innerIter = outerIter + 1; innerIter != ranges.end(); ++innerIter)
- {
- const Range &inner = *innerIter;
- if (outer.Overlap(inner))
- return true;
- }
- }
-
- return false;
-}
-
-Range Rule::GetSourceRange() const
-{
- assert(m_coll.size());
- const Range &first = m_coll.front().GetLatticeNode().GetSourceRange();
- const Range &last = m_coll.back().GetLatticeNode().GetSourceRange();
-
- Range ret(first.GetStartPos(), last.GetEndPos());
- return ret;
-}
-
-
-bool Rule::IsHole(const TunnelCollection &tunnelColl) const
-{
- const Range &spanS = GetSourceRange();
- const TunnelList &tunnels = tunnelColl.GetTunnels(spanS.GetStartPos(), spanS.GetEndPos());
-
- bool ret = tunnels.size() > 0;
- return ret;
-}
-
-
-bool Rule::CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const
-{
- if (GetNumSymbols() >= global.maxSymbols)
- return false;
- if (AdjacentDefaultNonTerms())
- return false;
- if (MaxNonTerm(global))
- return false;
- if (NonTermOverlap())
- {
- return false;
- }
-
- const Range spanS = GetSourceRange();
-
- if (tunnelColl.NumUnalignedWord(0, spanS.GetStartPos(), spanS.GetEndPos()) >= global.maxUnaligned)
- return false;
-// if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
-// return false;
-
-
- return true;
-}
-
-bool Rule::MaxNonTerm(const Global &global) const
-{
- //cerr << *this << endl;
- size_t numNonTerm = 0, numNonTermDefault = 0;
-
- CollType::const_iterator iter;
- for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
- {
- const LatticeNode *node = &(*iter).GetLatticeNode();
- if (!node->IsTerminal() )
- {
- numNonTerm++;
- if (!node->IsSyntax())
- {
- numNonTermDefault++;
- }
- if (numNonTerm >= global.maxNonTerm || numNonTermDefault >= global.maxNonTermDefault)
- return true;
- }
- }
-
- return false;
-}
-
-
-bool Rule::AdjacentDefaultNonTerms() const
-{
- assert(m_coll.size() > 0);
-
- const LatticeNode *prevNode = &m_coll.front().GetLatticeNode();
- CollType::const_iterator iter;
- for (iter = m_coll.begin() + 1; iter != m_coll.end(); ++iter)
- {
- const LatticeNode *node = &(*iter).GetLatticeNode();
- if (!prevNode->IsTerminal() && !node->IsTerminal() && !prevNode->IsSyntax() && !node->IsSyntax() )
- {
- return true;
- }
- prevNode = node;
- }
-
- return false;
-}
-
-
-
-size_t Rule::GetNumSymbols() const
-{
- size_t ret = m_coll.size();
- return ret;
-}
-
-void Rule::CreateRules(RuleCollection &rules
- , const Lattice &lattice
- , const SentenceAlignment &sentence
- , const Global &global)
-{
- assert(m_coll.size() > 0);
- const LatticeNode *latticeNode = &m_coll.back().GetLatticeNode();
- size_t endPos = latticeNode->GetSourceRange().GetEndPos() + 1;
-
- const Stack &stack = lattice.GetStack(endPos);
-
- Stack::const_iterator iter;
- for (iter = stack.begin(); iter != stack.end(); ++iter)
- {
- const LatticeNode *newLatticeNode = *iter;
- Rule *newRule = new Rule(*this, newLatticeNode);
- //cerr << *newRule << endl;
-
- if (newRule->CanRecurse(global, sentence.GetTunnelCollection()))
- { // may or maynot be valid, but can continue to build on this rule
- newRule->CreateRules(rules, lattice, sentence, global);
- }
-
- if (newRule->IsValid(global, sentence.GetTunnelCollection()))
- { // add to rule collection
- rules.Add(global, newRule, sentence);
- }
- else
- {
- delete newRule;
- }
-
- }
-}
-
-bool Rule::operator<(const Rule &compare) const
-{
- /*
- if (g_debug)
- {
- cerr << *this << endl << compare;
- cerr << endl;
- }
- */
-
- bool ret = Compare(compare) < 0;
-
- /*
- if (g_debug)
- {
- cerr << *this << endl << compare << endl << ret << endl << endl;
- }
- */
-
- return ret;
-}
-
-int Rule::Compare(const Rule &compare) const
-{
- //cerr << *this << endl << compare << endl;
- assert(m_coll.size() > 0);
- assert(m_source.GetSize() > 0);
- assert(m_target.GetSize() > 0);
-
- int ret = 0;
-
- // compare each fragment
- ret = m_source.Compare(compare.m_source);
- if (ret != 0)
- {
- return ret;
- }
-
- ret = m_target.Compare(compare.m_target);
- if (ret != 0)
- {
- return ret;
- }
-
- // compare lhs
- const string &thisSourceLabel = m_lhs->GetSyntaxNode(0).GetLabel();
- const string &otherSourceLabel = compare.m_lhs->GetSyntaxNode(0).GetLabel();
- if (thisSourceLabel != otherSourceLabel)
- {
- ret = (thisSourceLabel < otherSourceLabel) ? -1 : +1;
- return ret;
- }
-
- const string &thisTargetLabel = m_lhs->GetSyntaxNode(1).GetLabel();
- const string &otherTargetLabel = compare.m_lhs->GetSyntaxNode(1).GetLabel();
- if (thisTargetLabel != otherTargetLabel)
- {
- ret = (thisTargetLabel < otherTargetLabel) ? -1 : +1;
- return ret;
- }
-
- assert(ret == 0);
- return ret;
-}
-
-
-const LatticeNode &Rule::GetLatticeNode(size_t ind) const
-{
- assert(ind < m_coll.size());
- return m_coll[ind].GetLatticeNode();
-}
-
-void Rule::DebugOutput() const
-{
- Output(cerr);
-}
-
-void Rule::Output(std::ostream &out) const
-{
-
- stringstream strmeS, strmeT;
-
- std::vector<Symbol>::const_iterator iterSymbol;
- for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
- {
- const Symbol &symbol = *iterSymbol;
- strmeS << symbol << " ";
- }
-
- for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
- {
- const Symbol &symbol = *iterSymbol;
- strmeT << symbol << " ";
- }
-
- // lhs
- if (m_lhs)
- {
- strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
- strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
- }
-
- out << strmeS.str() << " ||| " << strmeT.str() << " ||| ";
-
- // alignment
- Rule::CollType::const_iterator iter;
- for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
- {
- const RuleElement &element = *iter;
- const LatticeNode &node = element.GetLatticeNode();
- bool isTerminal = node.IsTerminal();
-
- if (!isTerminal)
- {
- out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " ";
- }
- }
-
- out << "||| 1";
-
-}
-
-void Rule::OutputInv(std::ostream &out) const
-{
- stringstream strmeS, strmeT;
-
- std::vector<Symbol>::const_iterator iterSymbol;
- for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
- {
- const Symbol &symbol = *iterSymbol;
- strmeS << symbol << " ";
- }
-
- for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
- {
- const Symbol &symbol = *iterSymbol;
- strmeT << symbol << " ";
- }
-
- // lhs
- if (m_lhs)
- {
- strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
- strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
- }
-
- out << strmeT.str() << " ||| " << strmeS.str() << " ||| ";
-
- // alignment
- Rule::CollType::const_iterator iter;
- for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
- {
- const RuleElement &element = *iter;
- const LatticeNode &node = element.GetLatticeNode();
- bool isTerminal = node.IsTerminal();
-
- if (!isTerminal)
- {
- out << element.m_alignmentPos.second << "-" << element.m_alignmentPos.first << " ";
- }
- }
-
- out << "||| 1";
-
-}
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Rule.h b/contrib/other-builds/extract-mixed-syntax/Rule.h
deleted file mode 100644
index 3574094fe..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Rule.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#pragma once
-/*
- * Rule.h
- * extract
- *
- * Created by Hieu Hoang on 19/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <vector>
-#include <iostream>
-#include "LatticeNode.h"
-#include "SymbolSequence.h"
-#include "Global.h"
-
-class Lattice;
-class SentenceAlignment;
-class Global;
-class RuleCollection;
-class SyntaxNode;
-class TunnelCollection;
-class Range;
-
-class RuleElement
-{
-protected:
- const LatticeNode *m_latticeNode;
-public:
- std::pair<size_t, size_t> m_alignmentPos;
-
- RuleElement(const RuleElement &copy);
- RuleElement(const LatticeNode &latticeNode)
- :m_latticeNode(&latticeNode)
- ,m_alignmentPos(NOT_FOUND, NOT_FOUND)
- {}
-
- const LatticeNode &GetLatticeNode() const
- { return *m_latticeNode; }
-
-};
-
-class Rule
-{
-protected:
- typedef std::vector<RuleElement> CollType;
- CollType m_coll;
-
- const LatticeNode *m_lhs;
- SymbolSequence m_source, m_target;
-
- bool IsHole(const TunnelCollection &tunnelColl) const;
- bool NonTermOverlap() const;
-
- const LatticeNode &GetLatticeNode(size_t ind) const;
- void CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence);
-
-public:
- // init
- Rule(const LatticeNode *latticeNode);
-
- // create new rule by appending node to prev rule
- Rule(const Rule &prevRule, const LatticeNode *latticeNode);
-
- // create copy with lhs
- Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence);
-
- // can continue to add to this rule
- bool CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const;
-
- virtual ~Rule();
-
- // can add this to the set of rules
- bool IsValid(const Global &global, const TunnelCollection &tunnelColl) const;
-
- size_t GetNumSymbols() const;
- bool AdjacentDefaultNonTerms() const;
- bool MaxNonTerm(const Global &global) const;
- bool MoreDefaultNonTermThanTerm() const;
- bool SourceHasEdgeDefaultNonTerm() const;
-
- void CreateRules(RuleCollection &rules
- , const Lattice &lattice
- , const SentenceAlignment &sentence
- , const Global &global);
-
- int Compare(const Rule &compare) const;
- bool operator<(const Rule &compare) const;
-
- Range GetSourceRange() const;
-
- DEBUG_OUTPUT();
-
- void Output(std::ostream &out) const;
- void OutputInv(std::ostream &out) const;
-
-};
diff --git a/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp b/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp
deleted file mode 100644
index 8389a70cf..000000000
--- a/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * RuleCollection.cpp
- * extract
- *
- * Created by Hieu Hoang on 19/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include "RuleCollection.h"
-#include "Rule.h"
-#include "SentenceAlignment.h"
-#include "tables-core.h"
-#include "Lattice.h"
-#include "SyntaxTree.h"
-
-using namespace std;
-
-RuleCollection::~RuleCollection()
-{
- RemoveAllInColl(m_coll);
-}
-
-void RuleCollection::Add(const Global &global, Rule *rule, const SentenceAlignment &sentence)
-{
- Range spanS = rule->GetSourceRange();
-
- // cartesian product of lhs
- Stack nontermNodes = sentence.GetLattice().GetNonTermNode(spanS);
- Stack::const_iterator iterStack;
- for (iterStack = nontermNodes.begin(); iterStack != nontermNodes.end(); ++iterStack)
- {
- const LatticeNode &node = **iterStack;
- assert(!node.IsTerminal());
-
- bool isValid;
- // create rules with LHS
- //cerr << "old:" << *rule << endl;
- Rule *newRule = new Rule(global, isValid, *rule, &node, sentence);
-
- if (!isValid)
- { // lhs doesn't match non-term spans
- delete newRule;
- continue;
- }
-
- /*
- stringstream s;
- s << *newRule;
- if (s.str().find("Wiederaufnahme der [X] ||| resumption of the [X] ||| ||| 1") == 0)
- {
- cerr << "READY:" << *newRule << endl;
- g_debug = true;
- }
- else {
- g_debug = false;
- }
- */
-
- typedef set<const Rule*, CompareRule>::iterator Iterator;
- pair<Iterator,bool> ret = m_coll.insert(newRule);
-
- if (ret.second)
- {
- //cerr << "ACCEPTED:" << *newRule << endl;
- //cerr << "";
- }
- else
- {
- //cerr << "REJECTED:" << *newRule << endl;
- delete newRule;
- }
-
- }
-
- delete rule;
-
-}
-
-void RuleCollection::Output(std::ostream &out) const
-{
- RuleCollection::CollType::const_iterator iter;
- for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
- {
- const Rule &rule = **iter;
- rule.Output(out);
- out << endl;
- }
-}
-
-void RuleCollection::OutputInv(std::ostream &out) const
-{
- RuleCollection::CollType::const_iterator iter;
- for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
- {
- const Rule &rule = **iter;
- rule.OutputInv(out);
- out << endl;
- }
-}
-
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/RuleCollection.h b/contrib/other-builds/extract-mixed-syntax/RuleCollection.h
deleted file mode 100644
index 27d5d794a..000000000
--- a/contrib/other-builds/extract-mixed-syntax/RuleCollection.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#pragma once
-/*
- * RuleCollection.h
- * extract
- *
- * Created by Hieu Hoang on 19/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <set>
-#include <iostream>
-#include "Rule.h"
-
-class SentenceAlignment;
-
-// helper for sort. Don't compare default non-terminals
-struct CompareRule
-{
- bool operator() (const Rule *a, const Rule *b)
- {
- /*
- if (g_debug)
- {
- std::cerr << std::endl << (*a) << std::endl << (*b) << " ";
- }
- */
- bool ret = (*a) < (*b);
- /*
- if (g_debug)
- {
- std::cerr << ret << std::endl;
- }
- */
- return ret;
- }
-};
-
-
-class RuleCollection
-{
-protected:
- typedef std::set<const Rule*, CompareRule> CollType;
- CollType m_coll;
-
-public:
- ~RuleCollection();
- void Add(const Global &global, Rule *rule, const SentenceAlignment &sentence);
- size_t GetSize() const
- { return m_coll.size(); }
-
- void Output(std::ostream &out) const;
- void OutputInv(std::ostream &out) const;
-
-};
-
diff --git a/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp b/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp
deleted file mode 100644
index b13743bc1..000000000
--- a/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * SentenceAlignment.cpp
- * extract
- *
- * Created by Hieu Hoang on 19/01/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <set>
-#include <map>
-#include <sstream>
-#include "SentenceAlignment.h"
-#include "XmlTree.h"
-#include "tables-core.h"
-#include "TunnelCollection.h"
-#include "Lattice.h"
-#include "LatticeNode.h"
-
-using namespace std;
-
-extern std::set< std::string > targetLabelCollection, sourceLabelCollection;
-extern std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;
-
-SentenceAlignment::SentenceAlignment()
-:m_tunnelCollection(NULL)
-,m_lattice(NULL)
-{}
-
-SentenceAlignment::~SentenceAlignment()
-{
- delete m_tunnelCollection;
- delete m_lattice;
-}
-
-int SentenceAlignment::Create( const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global )
-{
-
- // tokenizing English (and potentially extract syntax spans)
- if (global.targetSyntax) {
- string targetStringCPP = string(targetString);
- ProcessAndStripXMLTags( targetStringCPP, targetTree, targetLabelCollection , targetTopLabelCollection );
- target = tokenize( targetStringCPP.c_str() );
- // cerr << "E: " << targetStringCPP << endl;
- }
- else {
- target = tokenize( targetString.c_str() );
- }
-
- // tokenizing source (and potentially extract syntax spans)
- if (global.sourceSyntax) {
- string sourceStringCPP = string(sourceString);
- ProcessAndStripXMLTags( sourceStringCPP, sourceTree, sourceLabelCollection , sourceTopLabelCollection );
- source = tokenize( sourceStringCPP.c_str() );
- // cerr << "F: " << sourceStringCPP << endl;
- }
- else {
- source = tokenize( sourceString.c_str() );
- }
-
- // check if sentences are empty
- if (target.size() == 0 || source.size() == 0) {
- cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl;
- cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
- return 0;
- }
-
- // prepare data structures for alignments
- for(int i=0; i<source.size(); i++) {
- alignedCountS.push_back( 0 );
- }
- for(int i=0; i<target.size(); i++) {
- vector< int > dummy;
- alignedToT.push_back( dummy );
- }
-
- //InitTightest(m_s2tTightest, source.size());
- //InitTightest(m_t2sTightest, target.size());
-
-
- // reading in alignments
- vector<string> alignmentSequence = tokenize( alignmentString.c_str() );
- for(int i=0; i<alignmentSequence.size(); i++) {
- int s,t;
- // cout << "scaning " << alignmentSequence[i].c_str() << endl;
- if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
- cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
- cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
- return 0;
- }
- // cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
- if (t >= target.size() || s >= source.size()) {
- cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
- cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
- return 0;
- }
- alignedToT[t].push_back( s );
- alignedCountS[s]++;
-
- //SetAlignment(s, t);
- }
-
- bool mixed = global.mixed;
- sourceTree.AddDefaultNonTerms(global.sourceSyntax, mixed, source.size());
- targetTree.AddDefaultNonTerms(global.targetSyntax, mixed, target.size());
-
- //CalcTightestSpan(m_s2tTightest);
- //CalcTightestSpan(m_t2sTightest);
-
- return 1;
-}
-
-/*
-void SentenceAlignment::InitTightest(Outer &tightest, size_t len)
-{
- tightest.resize(len);
-
- for (size_t posOuter = 0; posOuter < len; ++posOuter)
- {
- Inner &inner = tightest[posOuter];
- size_t innerSize = len - posOuter;
- inner.resize(innerSize);
-
- }
-}
-
-void SentenceAlignment::CalcTightestSpan(Outer &tightest)
-{
- size_t len = tightest.size();
-
- for (size_t startPos = 0; startPos < len; ++startPos)
- {
- for (size_t endPos = startPos + 1; endPos < len; ++endPos)
- {
- const Range &prevRange = GetTightest(tightest, startPos, endPos - 1);
- const Range &smallRange = GetTightest(tightest, endPos, endPos);
- Range &newRange = GetTightest(tightest, startPos, endPos);
-
- newRange.Merge(prevRange, smallRange);
- //cerr << "[" << startPos << "-" << endPos << "] --> [" << newRange.GetStartPos() << "-" << newRange.GetEndPos() << "]";
- }
- }
-}
-
-Range &SentenceAlignment::GetTightest(Outer &tightest, size_t startPos, size_t endPos)
-{
- assert(endPos < tightest.size());
- assert(endPos >= startPos);
-
- Inner &inner = tightest[startPos];
-
- size_t ind = endPos - startPos;
- Range &ret = inner[ind];
- return ret;
-}
-
-void SentenceAlignment::SetAlignment(size_t source, size_t target)
-{
- SetAlignment(m_s2tTightest, source, target);
- SetAlignment(m_t2sTightest, target, source);
-}
-
-void SentenceAlignment::SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos)
-{
-
- Range &range = GetTightest(tightest, thisPos, thisPos);
- if (range.GetStartPos() == NOT_FOUND)
- { // not yet set, do them both
- assert(range.GetEndPos() == NOT_FOUND);
- range.SetStartPos(thatPos);
- range.SetEndPos(thatPos);
- }
- else
- {
- assert(range.GetEndPos() != NOT_FOUND);
- range.SetStartPos( (range.GetStartPos() > thatPos) ? thatPos : range.GetStartPos() );
- range.SetEndPos( (range.GetEndPos() < thatPos) ? thatPos : range.GetEndPos() );
- }
-}
- */
-
-
-void SentenceAlignment::FindTunnels(const Global &global )
-{
- int countT = target.size();
- int countS = source.size();
- int maxSpan = max(global.maxHoleSpanSourceDefault, global.maxHoleSpanSourceSyntax);
-
- m_tunnelCollection = new TunnelCollection(countS);
-
- m_tunnelCollection->alignedCountS = alignedCountS;
- m_tunnelCollection->alignedCountT.resize(alignedToT.size());
- for (size_t ind = 0; ind < alignedToT.size(); ind++)
- {
- m_tunnelCollection->alignedCountT[ind] = alignedToT[ind].size();
- }
-
- // phrase repository for creating hiero phrases
-
- // check alignments for target phrase startT...endT
- for(int lengthT=1;
- lengthT <= maxSpan && lengthT <= countT;
- lengthT++) {
- for(int startT=0; startT < countT-(lengthT-1); startT++) {
-
- // that's nice to have
- int endT = startT + lengthT - 1;
-
- // if there is target side syntax, there has to be a node
- if (global.targetSyntax && !targetTree.HasNode(startT,endT))
- continue;
-
- // find find aligned source words
- // first: find minimum and maximum source word
- int minS = 9999;
- int maxS = -1;
- vector< int > usedS = alignedCountS;
- for(int ti=startT;ti<=endT;ti++) {
- for(int i=0;i<alignedToT[ti].size();i++) {
- int si = alignedToT[ti][i];
- // cerr << "point (" << si << ", " << ti << ")\n";
- if (si<minS) { minS = si; }
- if (si>maxS) { maxS = si; }
- usedS[ si ]--;
- }
- }
-
- // unaligned phrases are not allowed
- if( maxS == -1 )
- continue;
-
- // source phrase has to be within limits
- if( maxS-minS >= maxSpan )
- {
- continue;
- }
-
- // check if source words are aligned to out of bound target words
- bool out_of_bounds = false;
- for(int si=minS;si<=maxS && !out_of_bounds;si++)
- {
- if (usedS[si]>0) {
- out_of_bounds = true;
- }
- }
-
- // if out of bound, you gotta go
- if (out_of_bounds)
- continue;
-
- if (m_tunnelCollection->NumUnalignedWord(1, startT, endT) >= global.maxUnaligned)
- continue;
-
- // done with all the checks, lets go over all consistent phrase pairs
- // start point of source phrase may retreat over unaligned
- for(int startS=minS;
- (startS>=0 &&
- startS>maxS - maxSpan && // within length limit
- (startS==minS || alignedCountS[startS]==0)); // unaligned
- startS--)
- {
- // end point of source phrase may advance over unaligned
- for(int endS=maxS;
- (endS<countS && endS<startS + maxSpan && // within length limit
- (endS==maxS || alignedCountS[endS]==0)); // unaligned
- endS++)
- {
- if (m_tunnelCollection->NumUnalignedWord(0, startS, endS) >= global.maxUnaligned)
- continue;
-
- // take note that this is a valid phrase alignment
- m_tunnelCollection->Add(startS, endS, startT, endT);
- }
- }
- }
- }
-
- //cerr << *tunnelCollection << endl;
-
-}
-
-void SentenceAlignment::CreateLattice(const Global &global)
-{
- size_t countS = source.size();
- m_lattice = new Lattice(countS);
-
- for (size_t startPos = 0; startPos < countS; ++startPos)
- {
- //cerr << "creating arcs for " << startPos << "=";
- m_lattice->CreateArcs(startPos, *m_tunnelCollection, *this, global);
-
- //cerr << LatticeNode::s_count << endl;
- }
-}
-
-void SentenceAlignment::CreateRules(const Global &global)
-{
- size_t countS = source.size();
-
- for (size_t startPos = 0; startPos < countS; ++startPos)
- {
- //cerr << "creating rules for " << startPos << "\n";
- m_lattice->CreateRules(startPos, *this, global);
- }
-}
-
-void OutputSentenceStr(std::ostream &out, const std::vector<std::string> &vec)
-{
- for (size_t pos = 0; pos < vec.size(); ++pos)
- {
- out << vec[pos] << " ";
- }
-}
-
-std::ostream& operator<<(std::ostream &out, const SentenceAlignment &obj)
-{
- OutputSentenceStr(out, obj.target);
- out << " ==> ";
- OutputSentenceStr(out, obj.source);
- out << endl;
-
- out << *obj.m_tunnelCollection;
-
- if (obj.m_lattice)
- out << endl << *obj.m_lattice;
-
- return out;
-}
-
-
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.h b/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.h
deleted file mode 100644
index a94941309..000000000
--- a/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#pragma once
-/*
- * SentenceAlignment.h
- * extract
- *
- * Created by Hieu Hoang on 19/01/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <vector>
-#include <cassert>
-#include <iostream>
-#include "SyntaxTree.h"
-#include "Global.h"
-#include "Range.h"
-
-class TunnelCollection;
-class Lattice;
-
-class SentenceAlignment
-{
- friend std::ostream& operator<<(std::ostream&, const SentenceAlignment&);
-
-public:
- std::vector<std::string> target;
- std::vector<std::string> source;
- std::vector<int> alignedCountS;
- std::vector< std::vector<int> > alignedToT;
- SyntaxTree sourceTree, targetTree;
-
- //typedef std::vector<Range> Inner;
- //typedef std::vector<Inner> Outer;
-
- //Outer m_s2tTightest, m_t2sTightest;
-
- SentenceAlignment();
- ~SentenceAlignment();
- int Create(const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global);
- // void clear() { delete(alignment); };
- void FindTunnels( const Global &global ) ;
-
- void CreateLattice(const Global &global);
- void CreateRules(const Global &global);
-
- const TunnelCollection &GetTunnelCollection() const
- {
- assert(m_tunnelCollection);
- return *m_tunnelCollection;
- }
-
- const Lattice &GetLattice() const
- {
- assert(m_lattice);
- return *m_lattice;
- }
-
-protected:
- TunnelCollection *m_tunnelCollection;
- Lattice *m_lattice;
-
- /*
- void CalcTightestSpan(Outer &tightest);
- void InitTightest(Outer &tightest, size_t len);
- Range &GetTightest(Outer &tightest, size_t startPos, size_t endPos);
- void SetAlignment(size_t source, size_t target);
- void SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos);
- */
-};
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Symbol.cpp b/contrib/other-builds/extract-mixed-syntax/Symbol.cpp
deleted file mode 100644
index 0181dcaeb..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Symbol.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Symbol.cpp
- * extract
- *
- * Created by Hieu Hoang on 21/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <cassert>
-#include "Symbol.h"
-
-using namespace std;
-
-Symbol::Symbol(const std::string &label, size_t pos)
-:m_label(label)
-,m_isTerminal(true)
-,m_span(2)
-{
- m_span[0].first = pos;
-}
-
-Symbol::Symbol(const std::string &labelS, const std::string &labelT
- , size_t startS, size_t endS
- , size_t startT, size_t endT
- , bool isSourceSyntax, bool isTargetSyntax)
-:m_label(labelS)
-,m_labelT(labelT)
-,m_isTerminal(false)
-,m_span(2)
-,m_isSourceSyntax(isSourceSyntax)
-,m_isTargetSyntax(isTargetSyntax)
-{
- m_span[0] = std::pair<size_t, size_t>(startS, endS);
- m_span[1] = std::pair<size_t, size_t>(startT, endT);
-}
-
-int CompareNonTerm(bool thisIsSyntax, bool otherIsSyntax
- , const std::pair<size_t, size_t> &thisSpan, const std::pair<size_t, size_t> &otherSpan
- , std::string thisLabel, std::string otherLabel)
-{
- if (thisIsSyntax != otherIsSyntax)
- { // 1 is [X] & the other is [NP] on the source
- return thisIsSyntax ? -1 : +1;
- }
-
- assert(thisIsSyntax == otherIsSyntax);
- if (thisIsSyntax)
- { // compare span & label
- if (thisSpan != otherSpan)
- return thisSpan < otherSpan ? -1 : +1;
- if (thisLabel != otherLabel)
- return thisLabel < otherLabel ? -1 : +1;
- }
-
- return 0;
-}
-
-int Symbol::Compare(const Symbol &other) const
-{
- if (m_isTerminal != other.m_isTerminal)
- return m_isTerminal ? -1 : +1;
-
- assert(m_isTerminal == other.m_isTerminal);
- if (m_isTerminal)
- { // compare labels & pos
- if (m_span[0].first != other.m_span[0].first)
- return (m_span[0].first < other.m_span[0].first) ? -1 : +1;
-
- if (m_label != other.m_label)
- return (m_label < other.m_label) ? -1 : +1;
-
- }
- else
- { // non terms
- int ret = CompareNonTerm(m_isSourceSyntax, other.m_isSourceSyntax
- ,m_span[0], other.m_span[0]
- ,m_label, other.m_label);
- if (ret != 0)
- return ret;
-
- ret = CompareNonTerm(m_isTargetSyntax, other.m_isTargetSyntax
- ,m_span[1], other.m_span[1]
- ,m_label, other.m_label);
- if (ret != 0)
- return ret;
- }
-
- return 0;
-}
-
-
-std::ostream& operator<<(std::ostream &out, const Symbol &obj)
-{
- if (obj.m_isTerminal)
- out << obj.m_label;
- else
- out << obj.m_label + obj.m_labelT;
-
- return out;
-}
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Symbol.h b/contrib/other-builds/extract-mixed-syntax/Symbol.h
deleted file mode 100644
index b79a705b2..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Symbol.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-/*
- * Symbol.h
- * extract
- *
- * Created by Hieu Hoang on 21/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <string>
-#include <iostream>
-#include <vector>
-
-class Symbol
-{
- friend std::ostream& operator<<(std::ostream &out, const Symbol &obj);
-
-protected:
- std::string m_label, m_labelT; // m_labelT only for non-term
- std::vector<std::pair<size_t, size_t> > m_span;
-
- bool m_isTerminal, m_isSourceSyntax, m_isTargetSyntax;
-public:
- // for terminals
- Symbol(const std::string &label, size_t pos);
-
- // for non-terminals
- Symbol(const std::string &labelS, const std::string &labelT
- , size_t startS, size_t endS
- , size_t startT, size_t endT
- , bool isSourceSyntax, bool isTargetSyntax);
-
- int Compare(const Symbol &other) const;
-
-};
diff --git a/contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp b/contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp
deleted file mode 100644
index 0cf19f664..000000000
--- a/contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * SymbolSequence.cpp
- * extract
- *
- * Created by Hieu Hoang on 21/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <cassert>
-#include <sstream>
-#include "SymbolSequence.h"
-
-using namespace std;
-
-int SymbolSequence::Compare(const SymbolSequence &other) const
-{
- int ret;
- size_t thisSize = GetSize();
- size_t otherSize = other.GetSize();
- if (thisSize != otherSize)
- {
- ret = (thisSize < otherSize) ? -1 : +1;
- return ret;
- }
- else
- {
- assert(thisSize == otherSize);
- for (size_t ind = 0; ind < thisSize; ++ind)
- {
- const Symbol &thisSymbol = GetSymbol(ind);
- const Symbol &otherSymbol = other.GetSymbol(ind);
- ret = thisSymbol.Compare(otherSymbol);
- if (ret != 0)
- {
- return ret;
- }
- }
- }
-
- assert(ret == 0);
- return ret;
-}
-
-std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj)
-{
- SymbolSequence::CollType::const_iterator iterSymbol;
- for (iterSymbol = obj.m_coll.begin(); iterSymbol != obj.m_coll.end(); ++iterSymbol)
- {
- const Symbol &symbol = *iterSymbol;
- out << symbol << " ";
- }
-
- return out;
-}
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/SymbolSequence.h b/contrib/other-builds/extract-mixed-syntax/SymbolSequence.h
deleted file mode 100644
index 997c24205..000000000
--- a/contrib/other-builds/extract-mixed-syntax/SymbolSequence.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#pragma once
-/*
- * SymbolSequence.h
- * extract
- *
- * Created by Hieu Hoang on 21/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <iostream>
-#include <vector>
-#include "Symbol.h"
-
-class SymbolSequence
-{
- friend std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj);
-
-protected:
- typedef std::vector<Symbol> CollType;
- CollType m_coll;
-
-public:
- typedef CollType::iterator iterator;
- typedef CollType::const_iterator const_iterator;
- const_iterator begin() const { return m_coll.begin(); }
- const_iterator end() const { return m_coll.end(); }
-
- void Add(const Symbol &symbol)
- {
- m_coll.push_back(symbol);
- }
- size_t GetSize() const
- { return m_coll.size(); }
- const Symbol &GetSymbol(size_t ind) const
- { return m_coll[ind]; }
-
- void Clear()
- { m_coll.clear(); }
-
- int Compare(const SymbolSequence &other) const;
-
-};
diff --git a/contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp b/contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp
deleted file mode 100644
index a6ba3de7b..000000000
--- a/contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2009 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-
-#include <iostream>
-#include <cassert>
-#include "SyntaxTree.h"
-//#include "extract.h"
-#include "Global.h"
-
-//extern const Global g_debug;
-extern const Global *g_global;
-
-using namespace std;
-
-bool SyntaxNode::IsSyntax() const
-{
- bool ret = GetLabel() != "[X]";
- return ret;
-}
-
-SyntaxTree::SyntaxTree()
-:m_defaultLHS(0,0, "[X]")
-{
- m_emptyNode.clear();
-}
-
-SyntaxTree::~SyntaxTree()
-{
- // loop through all m_nodes, delete them
- for(int i=0; i<m_nodes.size(); i++)
- {
- delete m_nodes[i];
- }
-}
-
-bool HasDuplicates(const SyntaxNodes &nodes)
-{
- string prevLabel;
- SyntaxNodes::const_iterator iter;
- for (iter = nodes.begin(); iter != nodes.end(); ++iter)
- {
- const SyntaxNode &node = **iter;
- string label = node.GetLabel();
- if (label == prevLabel)
- return true;
- }
- return false;
-}
-
-void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
-{
- SyntaxNode* newNode = new SyntaxNode( startPos, endPos, "[" + label + "]");
- m_nodes.push_back( newNode );
-
- SyntaxNodes &nodesChart = m_index[ startPos ][ endPos ];
-
- if (!g_global->uppermostOnly)
- {
- nodesChart.push_back( newNode );
- //assert(!HasDuplicates(m_index[ startPos ][ endPos ]));
- }
- else
- {
- if (nodesChart.size() > 0)
- {
- assert(nodesChart.size() == 1);
- //delete nodes[0];
- nodesChart.resize(0);
- }
- assert(nodesChart.size() == 0);
- nodesChart.push_back( newNode );
- }
-}
-
-ParentNodes SyntaxTree::Parse() {
- ParentNodes parents;
-
- int size = m_index.size();
-
- // looping through all spans of size >= 2
- for( int length=2; length<=size; length++ )
- {
- for( int startPos = 0; startPos <= size-length; startPos++ )
- {
- if (HasNode( startPos, startPos+length-1 ))
- {
- // processing one (parent) span
-
- //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
- SplitPoints splitPoints;
- splitPoints.push_back( startPos );
- //std::cerr << " " << startPos;
-
- int first = 1;
- int covered = 0;
- while( covered < length )
- {
- // find largest covering subspan (child)
- // starting at last covered position
- for( int midPos=length-first; midPos>covered; midPos-- )
- {
- if( HasNode( startPos+covered, startPos+midPos-1 ) )
- {
- covered = midPos;
- splitPoints.push_back( startPos+covered );
- // std::cerr << " " << ( startPos+covered );
- first = 0;
- }
- }
- }
- // std::cerr << std::endl;
- parents.push_back( splitPoints );
- }
- }
- }
- return parents;
-}
-
-bool SyntaxTree::HasNode( int startPos, int endPos ) const
-{
- return GetNodes( startPos, endPos).size() > 0;
-}
-
-const SyntaxNodes &SyntaxTree::GetNodes( int startPos, int endPos ) const
-{
- SyntaxTreeIndexIterator startIndex = m_index.find( startPos );
- if (startIndex == m_index.end() )
- return m_emptyNode;
-
- SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos );
- if (endIndex == startIndex->second.end())
- return m_emptyNode;
-
- return endIndex->second;
-}
-
-// for printing out tree
-std::string SyntaxTree::ToString() const
-{
- std::stringstream out;
- out << *this;
- return out.str();
-}
-
-void SyntaxTree::AddDefaultNonTerms(size_t phraseSize)
-{
- for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
- {
- for (size_t endPos = startPos; endPos < phraseSize; ++endPos)
- {
- AddNode(startPos, endPos, "X");
- }
- }
-}
-
-void SyntaxTree::AddDefaultNonTerms(bool isSyntax, bool mixed, size_t phraseSize)
-{
- if (isSyntax)
- {
- AddDefaultNonTerms(!mixed, phraseSize);
- }
- else
- { // add X everywhere
- AddDefaultNonTerms(phraseSize);
- }
-}
-
-void SyntaxTree::AddDefaultNonTerms(bool addEverywhere, size_t phraseSize)
-{
- //cerr << "GetNumWords()=" << GetNumWords() << endl;
- //assert(phraseSize == GetNumWords() || GetNumWords() == 1); // 1 if syntax sentence doesn't have any xml. TODO fix syntax tree obj
-
- for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
- {
- for (size_t endPos = startPos; endPos <= phraseSize; ++endPos)
- {
- const SyntaxNodes &nodes = GetNodes(startPos, endPos);
- if (!addEverywhere && nodes.size() > 0)
- { // only add if no label
- continue;
- }
- AddNode(startPos, endPos, "X");
- }
- }
-}
-
-const SyntaxNodes SyntaxTree::GetNodesForLHS( int startPos, int endPos ) const
-{
- SyntaxNodes ret(GetNodes(startPos, endPos));
-
- if (ret.size() == 0)
- ret.push_back(&m_defaultLHS);
-
- return ret;
-}
-
-std::ostream& operator<<(std::ostream& os, const SyntaxTree& t)
-{
- int size = t.m_index.size();
- for(size_t length=1; length<=size; length++)
- {
- for(size_t space=0; space<length; space++)
- {
- os << " ";
- }
- for(size_t start=0; start<=size-length; start++)
- {
-
- if (t.HasNode( start, start+(length-1) ))
- {
- std::string label = t.GetNodes( start, start+(length-1) )[0]->GetLabel() + "#######";
-
- os << label.substr(0,7) << " ";
- }
- else
- {
- os << "------- ";
- }
- }
- os << std::endl;
- }
- return os;
-}
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/SyntaxTree.h b/contrib/other-builds/extract-mixed-syntax/SyntaxTree.h
deleted file mode 100644
index 50a73a369..000000000
--- a/contrib/other-builds/extract-mixed-syntax/SyntaxTree.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#pragma once
-
-// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2009 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#include <string>
-#include <vector>
-#include <map>
-#include <sstream>
-
-class SyntaxNode;
-
-typedef std::vector<const SyntaxNode*> SyntaxNodes;
-
-class SyntaxNode {
-protected:
- int m_start, m_end;
- std::string m_label;
- SyntaxNodes m_children;
- SyntaxNode* m_parent;
-public:
-SyntaxNode( int startPos, int endPos, const std::string &label)
- :m_start(startPos)
- ,m_end(endPos)
- ,m_label(label)
- {}
- int GetStart() const
- { return m_start; }
- int GetEnd() const
- { return m_end; }
- const std::string &GetLabel() const
- { return m_label; }
- bool IsSyntax() const;
-};
-
-
-typedef std::vector< int > SplitPoints;
-typedef std::vector< SplitPoints > ParentNodes;
-
-class SyntaxTree {
-protected:
- SyntaxNodes m_nodes;
- SyntaxNode* m_top;
- SyntaxNode m_defaultLHS;
-
- typedef std::map< int, SyntaxNodes > SyntaxTreeIndex2;
- typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
- typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
- typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
- SyntaxTreeIndex m_index;
- SyntaxNodes m_emptyNode;
-
- friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);
-
-public:
- SyntaxTree();
- ~SyntaxTree();
-
- void AddNode( int startPos, int endPos, std::string label );
- ParentNodes Parse();
- bool HasNode( int startPos, int endPos ) const;
- const SyntaxNodes &GetNodes( int startPos, int endPos ) const;
- const SyntaxNodes &GetAllNodes() const { return m_nodes; } ;
- size_t GetNumWords() const { return m_index.size(); }
- std::string ToString() const;
-
- void AddDefaultNonTerms(bool isSyntax, bool addEverywhere, size_t phraseSize);
- void AddDefaultNonTerms(bool mixed, size_t phraseSize);
-
- void AddDefaultNonTerms(size_t phraseSize);
-
- const SyntaxNodes GetNodesForLHS( int startPos, int endPos ) const;
-
-};
-
-std::ostream& operator<<(std::ostream&, const SyntaxTree&);
-
diff --git a/contrib/other-builds/extract-mixed-syntax/Tunnel.cpp b/contrib/other-builds/extract-mixed-syntax/Tunnel.cpp
deleted file mode 100644
index fc4846c34..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Tunnel.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Tunnel.cpp
- * extract
- *
- * Created by Hieu Hoang on 19/01/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-
-#include "Tunnel.h"
-
-
-int Tunnel::Compare(const Tunnel &other) const
-{
- int ret = m_sourceRange.Compare(other.m_sourceRange);
-
- if (ret != 0)
- return ret;
-
- ret = m_targetRange.Compare(other.m_targetRange);
-
- return ret;
-}
-
-int Tunnel::Compare(const Tunnel &other, size_t direction) const
-{
- const Range &thisRange = (direction == 0) ? m_sourceRange : m_targetRange;
- const Range &otherRange = (direction == 0) ? other.m_sourceRange : other.m_targetRange;
-
- int ret = thisRange.Compare(otherRange);
- return ret;
-}
-
-std::ostream& operator<<(std::ostream &out, const Tunnel &tunnel)
-{
- out << tunnel.m_sourceRange << "==>" << tunnel.m_targetRange;
- return out;
-}
diff --git a/contrib/other-builds/extract-mixed-syntax/Tunnel.h b/contrib/other-builds/extract-mixed-syntax/Tunnel.h
deleted file mode 100644
index 2659cca4a..000000000
--- a/contrib/other-builds/extract-mixed-syntax/Tunnel.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#pragma once
-
-/*
- * Tunnel.h
- * extract
- *
- * Created by Hieu Hoang on 19/01/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <vector>
-#include <cassert>
-#include <string>
-#include <iostream>
-#include "Range.h"
-
- // for unaligned source terminal
-
-class Tunnel
-{
- friend std::ostream& operator<<(std::ostream&, const Tunnel&);
-
-protected:
-
- Range m_sourceRange, m_targetRange;
-
-public:
- Tunnel()
- {}
-
- Tunnel(const Tunnel &copy)
- :m_sourceRange(copy.m_sourceRange)
- ,m_targetRange(copy.m_targetRange)
- {}
-
- Tunnel(const Range &sourceRange, const Range &targetRange)
- :m_sourceRange(sourceRange)
- ,m_targetRange(targetRange)
- {}
-
- const Range &GetRange(size_t direction) const
- { return (direction == 0) ? m_sourceRange : m_targetRange; }
-
- int Compare(const Tunnel &other) const;
- int Compare(const Tunnel &other, size_t direction) const;
-};
-
-typedef std::vector<Tunnel> TunnelList;
-
diff --git a/contrib/other-builds/extract-mixed-syntax/TunnelCollection.cpp b/contrib/other-builds/extract-mixed-syntax/TunnelCollection.cpp
deleted file mode 100644
index 228cc3070..000000000
--- a/contrib/other-builds/extract-mixed-syntax/TunnelCollection.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * TunnelCollection.cpp
- * extract
- *
- * Created by Hieu Hoang on 19/01/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-
-#include "TunnelCollection.h"
-#include "Range.h"
-
-using namespace std;
-
-size_t TunnelCollection::NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const
-{
- assert(startPos <= endPos);
-
- if (direction == 0)
- assert(endPos < alignedCountS.size());
- else
- assert(endPos < alignedCountT.size());
-
- size_t ret = 0;
- for (size_t ind = startPos; ind <= endPos; ++ind)
- {
- if (direction == 0 && alignedCountS[ind] == 0)
- {
- ret++;
- }
- else if (direction == 1 && alignedCountT[ind] == 0)
- {
- ret++;
- }
-
- }
-
- return ret;
-}
-
-void TunnelCollection::Add(int startS, int endS, int startT, int endT)
-{
- // m_phraseExist[startS][endS - startS].push_back(Tunnel(startT, endT));
- m_coll[startS][endS - startS].push_back(Tunnel(Range(startS, endS), Range(startT, endT)));
-}
-
-
-std::ostream& operator<<(std::ostream &out, const TunnelCollection &TunnelCollection)
-{
- size_t size = TunnelCollection.GetSize();
-
- for (size_t startPos = 0; startPos < size; ++startPos)
- {
- for (size_t endPos = startPos; endPos < size; ++endPos)
- {
- const TunnelList &tunnelList = TunnelCollection.GetTunnels(startPos, endPos);
- TunnelList::const_iterator iter;
- for (iter = tunnelList.begin(); iter != tunnelList.end(); ++iter)
- {
- const Tunnel &tunnel = *iter;
- out << tunnel << " ";
-
- }
- }
- }
-
- return out;
-}
-
-
diff --git a/contrib/other-builds/extract-mixed-syntax/TunnelCollection.h b/contrib/other-builds/extract-mixed-syntax/TunnelCollection.h
deleted file mode 100644
index 547cbf814..000000000
--- a/contrib/other-builds/extract-mixed-syntax/TunnelCollection.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#pragma once
-/*
- * TunnelCollection.h
- * extract
- *
- * Created by Hieu Hoang on 19/01/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <vector>
-#include "Tunnel.h"
-
-// reposity of extracted phrase pairs
-// which are potential tunnels in larger phrase pairs
-class TunnelCollection
- {
- friend std::ostream& operator<<(std::ostream&, const TunnelCollection&);
-
- protected:
- std::vector< std::vector<TunnelList> > m_coll;
- // indexed by source pos. and source length
- // maps to list of tunnels where <int, int> are target pos
-
- public:
- std::vector<int> alignedCountS, alignedCountT;
-
- TunnelCollection(const TunnelCollection &);
-
- TunnelCollection(size_t size)
- :m_coll(size)
- {
- // size is the length of the source sentence
- for (size_t pos = 0; pos < size; ++pos)
- {
- // create empty tunnel lists
- std::vector<TunnelList> &endVec = m_coll[pos];
- endVec.resize(size - pos);
- }
- }
-
- void Add(int startS, int endS, int startT, int endT);
-
- //const TunnelList &GetTargetHoles(int startS, int endS) const
- //{
- // const TunnelList &targetHoles = m_phraseExist[startS][endS - startS];
- // return targetHoles;
- //}
- const TunnelList &GetTunnels(int startS, int endS) const
- {
- const TunnelList &sourceHoles = m_coll[startS][endS - startS];
- return sourceHoles;
- }
-
- const size_t GetSize() const
- { return m_coll.size(); }
-
- size_t NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const;
-
-
- };
-
diff --git a/contrib/other-builds/extract-mixed-syntax/XmlTree.cpp b/contrib/other-builds/extract-mixed-syntax/XmlTree.cpp
deleted file mode 100644
index 9145c8d1c..000000000
--- a/contrib/other-builds/extract-mixed-syntax/XmlTree.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2006 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#include <vector>
-#include <string>
-#include <set>
-#include <iostream>
-#include <stdlib.h>
-#include "SyntaxTree.h"
-
-using namespace std;
-
-
-inline std::vector<std::string> Tokenize(const std::string& str,
- const std::string& delimiters = " \t")
-{
- std::vector<std::string> tokens;
- // Skip delimiters at beginning.
- std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
- // Find first "non-delimiter".
- std::string::size_type pos = str.find_first_of(delimiters, lastPos);
-
- while (std::string::npos != pos || std::string::npos != lastPos)
- {
- // Found a token, add it to the vector.
- tokens.push_back(str.substr(lastPos, pos - lastPos));
- // Skip delimiters. Note the "not_of"
- lastPos = str.find_first_not_of(delimiters, pos);
- // Find next "non-delimiter"
- pos = str.find_first_of(delimiters, lastPos);
- }
-
- return tokens;
-}
-
-const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
-{
- std::string res = str;
- res.erase(str.find_last_not_of(dropChars)+1);
- return res.erase(0, res.find_first_not_of(dropChars));
-}
-
-string ParseXmlTagAttribute(const string& tag,const string& attributeName){
- /*TODO deal with unescaping \"*/
- string tagOpen = attributeName + "=\"";
- size_t contentsStart = tag.find(tagOpen);
- if (contentsStart == string::npos) return "";
- contentsStart += tagOpen.size();
- size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
- if (contentsEnd == string::npos) {
- cerr << "Malformed XML attribute: "<< tag;
- return "";
- }
- size_t possibleEnd;
- while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != string::npos) {
- contentsEnd = possibleEnd;
- }
- return tag.substr(contentsStart,contentsEnd-contentsStart);
-}
-
-/**
- * Remove "<" and ">" from XML tag
- *
- * \param str xml token to be stripped
- */
-string TrimXml(const string& str)
-{
- // too short to be xml token -> do nothing
- if (str.size() < 2) return str;
-
- // strip first and last character
- if (str[0] == '<' && str[str.size() - 1] == '>')
- {
- return str.substr(1, str.size() - 2);
- }
- // not an xml token -> do nothing
- else { return str; }
-}
-
-/**
- * Check if the token is an XML tag, i.e. starts with "<"
- *
- * \param tag token to be checked
- */
-bool isXmlTag(const string& tag)
-{
- return tag[0] == '<';
-}
-
-/**
- * Split up the input character string into tokens made up of
- * either XML tags or text.
- * example: this <b> is a </b> test .
- * => (this ), (<b>), ( is a ), (</b>), ( test .)
- *
- * \param str input string
- */
-inline vector<string> TokenizeXml(const string& str)
-{
- string lbrack = "<";
- string rbrack = ">";
- vector<string> tokens; // vector of tokens to be returned
- string::size_type cpos = 0; // current position in string
- string::size_type lpos = 0; // left start of xml tag
- string::size_type rpos = 0; // right end of xml tag
-
- // walk thorugh the string (loop vver cpos)
- while (cpos != str.size())
- {
- // find the next opening "<" of an xml tag
- lpos = str.find_first_of(lbrack, cpos);
- if (lpos != string::npos)
- {
- // find the end of the xml tag
- rpos = str.find_first_of(rbrack, lpos);
- // sanity check: there has to be closing ">"
- if (rpos == string::npos)
- {
- cerr << "ERROR: malformed XML: " << str << endl;
- return tokens;
- }
- }
- else // no more tags found
- {
- // add the rest as token
- tokens.push_back(str.substr(cpos));
- break;
- }
-
- // add stuff before xml tag as token, if there is any
- if (lpos - cpos > 0)
- tokens.push_back(str.substr(cpos, lpos - cpos));
-
- // add xml tag as token
- tokens.push_back(str.substr(lpos, rpos-lpos+1));
- cpos = rpos + 1;
- }
- return tokens;
-}
-
-/**
- * Process a sentence with xml annotation
- * Xml tags may specifiy additional/replacing translation options
- * and reordering constraints
- *
- * \param line in: sentence, out: sentence without the xml
- * \param res vector with translation options specified by xml
- * \param reorderingConstraint reordering constraint zones specified by xml
- * \param walls reordering constraint walls specified by xml
- */
-/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
- is so we can link things up afterwards. We can't create TranslationOptions as we
- parse because we don't have the completed source parsed until after this function
- removes all the markup from it (CreateFromString in Sentence::Read).
-*/
-bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) {
- //parse XML markup in translation line
-
- // no xml tag? we're done.
- if (line.find_first_of('<') == string::npos) { return true; }
-
- // break up input into a vector of xml tags and text
- // example: (this), (<b>), (is a), (</b>), (test .)
- vector<string> xmlTokens = TokenizeXml(line);
-
- // we need to store opened tags, until they are closed
- // tags are stored as tripled (tagname, startpos, contents)
- typedef pair< string, pair< size_t, string > > OpenedTag;
- vector< OpenedTag > tagStack; // stack that contains active opened tags
-
- string cleanLine; // return string (text without xml)
- size_t wordPos = 0; // position in sentence (in terms of number of words)
- bool isLinked = false;
-
- // loop through the tokens
- for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
- {
- // not a xml tag, but regular text (may contain many words)
- if(!isXmlTag(xmlTokens[xmlTokenPos]))
- {
- // add a space at boundary, if necessary
- if (cleanLine.size()>0 &&
- cleanLine[cleanLine.size() - 1] != ' ' &&
- xmlTokens[xmlTokenPos][0] != ' ')
- {
- cleanLine += " ";
- }
- cleanLine += xmlTokens[xmlTokenPos]; // add to output
- wordPos = Tokenize(cleanLine).size(); // count all the words
- }
-
- // process xml tag
- else
- {
- // *** get essential information about tag ***
-
- // strip extra boundary spaces and "<" and ">"
- string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
- // cerr << "XML TAG IS: " << tag << std::endl;
-
- if (tag.size() == 0)
- {
- cerr << "ERROR: empty tag name: " << line << endl;
- return false;
- }
-
- // check if unary (e.g., "<wall/>")
- bool isUnary = ( tag[tag.size() - 1] == '/' );
-
- // check if opening tag (e.g. "<a>", not "</a>")g
- bool isClosed = ( tag[0] == '/' );
- bool isOpen = !isClosed;
-
- if (isClosed && isUnary)
- {
- cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
- return false;
- }
-
- if (isClosed)
- tag = tag.substr(1); // remove "/" at the beginning
- if (isUnary)
- tag = tag.substr(0,tag.size()-1); // remove "/" at the end
-
- // find the tag name and contents
- string::size_type endOfName = tag.find_first_of(' ');
- string tagName = tag;
- string tagContent = "";
- if (endOfName != string::npos) {
- tagName = tag.substr(0,endOfName);
- tagContent = tag.substr(endOfName+1);
- }
-
- // *** process new tag ***
-
- if (isOpen || isUnary)
- {
- // put the tag on the tag stack
- OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
- tagStack.push_back( openedTag );
- // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
- }
-
- // *** process completed tag ***
-
- if (isClosed || isUnary)
- {
- // pop last opened tag from stack;
- if (tagStack.size() == 0)
- {
- cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
- return false;
- }
- OpenedTag openedTag = tagStack.back();
- tagStack.pop_back();
-
- // tag names have to match
- if (openedTag.first != tagName)
- {
- cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
- return false;
- }
-
- // assemble remaining information about tag
- size_t startPos = openedTag.second.first;
- string tagContent = openedTag.second.second;
- size_t endPos = wordPos;
-
- // span attribute overwrites position
- string span = ParseXmlTagAttribute(tagContent,"span");
- if (! span.empty())
- {
- vector<string> ij = Tokenize(span, "-");
- if (ij.size() != 1 && ij.size() != 2) {
- cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
- return false;
- }
- startPos = atoi(ij[0].c_str());
- if (ij.size() == 1) endPos = startPos + 1;
- else endPos = atoi(ij[1].c_str()) + 1;
- }
-
- // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
-
- if (startPos >= endPos)
- {
- cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
- return false;
- }
-
- string label = ParseXmlTagAttribute(tagContent,"label");
- labelCollection.insert( label );
-
- // report what we have processed so far
- if (0) {
- cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
- cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
- cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
- }
- tree.AddNode( startPos, endPos-1, label );
- }
- }
- }
- // we are done. check if there are tags that are still open
- if (tagStack.size() > 0)
- {
- cerr << "ERROR: some opened tags were never closed: " << line << endl;
- return false;
- }
-
- // collect top labels
- const SyntaxNodes &topNodes = tree.GetNodes( 0, wordPos-1 );
- for( SyntaxNodes::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ )
- {
- const SyntaxNode *n = *node;
- const string &label = n->GetLabel();
- if (topLabelCollection.find( label ) == topLabelCollection.end())
- topLabelCollection[ label ] = 0;
- topLabelCollection[ label ]++;
- }
-
- // return de-xml'ed sentence in line
- line = cleanLine;
- return true;
-}
diff --git a/contrib/other-builds/extract-mixed-syntax/extract.cpp b/contrib/other-builds/extract-mixed-syntax/extract.cpp
deleted file mode 100644
index 334a3e124..000000000
--- a/contrib/other-builds/extract-mixed-syntax/extract.cpp
+++ /dev/null
@@ -1,310 +0,0 @@
-// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2009 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#include <cstdio>
-#include <stdlib.h>
-#include <assert.h>
-#include <time.h>
-#include <cstring>
-#include <sstream>
-#include <iostream>
-#include "extract.h"
-#include "InputFileStream.h"
-#include "OutputFileStream.h"
-#include "Lattice.h"
-
-#ifdef WIN32
-// Include Visual Leak Detector
-#include <vld.h>
-#endif
-
-using namespace std;
-
-void writeGlueGrammar(const string &, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
-
-int main(int argc, char* argv[])
-{
- cerr << "Extract v2.0, written by Philipp Koehn\n"
- << "rule extraction from an aligned parallel corpus\n";
- //time_t starttime = time(NULL);
-
- Global *global = new Global();
- g_global = global;
- int sentenceOffset = 0;
-
- if (argc < 5) {
- cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
- << " [ --Hierarchical | --Orientation"
- << " | --GlueGrammar FILE | --UnknownWordLabel FILE"
- << " | --OnlyDirect"
-
- << " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
- << " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
- << " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
- << " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"
-
- << " | --MaxSymbols[" << global->maxSymbols<< "]"
- << " | --MaxNonTerm[" << global->maxNonTerm << "]"
- << " | --SourceSyntax | --TargetSyntax"
- << " | --UppermostOnly[" << g_global->uppermostOnly << "]"
- << endl;
- exit(1);
- }
- char* &fileNameT = argv[1];
- char* &fileNameS = argv[2];
- char* &fileNameA = argv[3];
- string fileNameGlueGrammar;
- string fileNameUnknownWordLabel;
- string fileNameExtract = string(argv[4]);
-
- int optionInd = 5;
-
- for(int i=optionInd;i<argc;i++)
- {
- if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
- global->minHoleSpanSourceDefault = atoi(argv[++i]);
- if (global->minHoleSpanSourceDefault < 1) {
- cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
- exit(1);
- }
- }
- else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
- global->maxHoleSpanSourceDefault = atoi(argv[++i]);
- if (global->maxHoleSpanSourceDefault < 1) {
- cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
- exit(1);
- }
- }
- else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
- global->minHoleSpanSourceSyntax = atoi(argv[++i]);
- if (global->minHoleSpanSourceSyntax < 1) {
- cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
- exit(1);
- }
- }
- else if (strcmp(argv[i],"--UppermostOnly") == 0) {
- global->uppermostOnly = atoi(argv[++i]);
- }
- else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
- global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
- if (global->maxHoleSpanSourceSyntax < 1) {
- cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
- exit(1);
- }
- }
-
- // maximum number of words in hierarchical phrase
- else if (strcmp(argv[i],"--maxSymbols") == 0) {
- global->maxSymbols = atoi(argv[++i]);
- if (global->maxSymbols < 1) {
- cerr << "extract error: --maxSymbols should be at least 1" << endl;
- exit(1);
- }
- }
- // maximum number of non-terminals
- else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
- global->maxNonTerm = atoi(argv[++i]);
- if (global->maxNonTerm < 1) {
- cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
- exit(1);
- }
- }
- // allow consecutive non-terminals (X Y | X Y)
- else if (strcmp(argv[i],"--TargetSyntax") == 0) {
- global->targetSyntax = true;
- }
- else if (strcmp(argv[i],"--SourceSyntax") == 0) {
- global->sourceSyntax = true;
- }
- // do not create many part00xx files!
- else if (strcmp(argv[i],"--NoFileLimit") == 0) {
- // now default
- }
- else if (strcmp(argv[i],"--GlueGrammar") == 0) {
- global->glueGrammarFlag = true;
- if (++i >= argc)
- {
- cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
- exit(0);
- }
- fileNameGlueGrammar = string(argv[i]);
- cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
- }
- else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
- global->unknownWordLabelFlag = true;
- if (++i >= argc)
- {
- cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
- exit(0);
- }
- fileNameUnknownWordLabel = string(argv[i]);
- cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
- }
- // TODO: this should be a useful option
- //else if (strcmp(argv[i],"--ZipFiles") == 0) {
- // zipFiles = true;
- //}
- // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
- else if (strcmp(argv[i],"--Mixed") == 0) {
- global->mixed = true;
- }
- else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
- global->allowDefaultNonTermEdge = atoi(argv[++i]);
- }
- else if (strcmp(argv[i], "--GZOutput") == 0) {
- global->gzOutput = true;
- }
- else if (strcmp(argv[i],"--MaxSpan") == 0) {
- // ignore
- ++i;
- }
- else if (strcmp(argv[i],"--SentenceOffset") == 0) {
- if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
- cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
- exit(1);
- }
- sentenceOffset = atoi(argv[++i]);
- }
- else {
- cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
- exit(1);
- }
- }
-
-
- // open input files
- Moses::InputFileStream tFile(fileNameT);
- Moses::InputFileStream sFile(fileNameS);
- Moses::InputFileStream aFile(fileNameA);
-
- // open output files
- string fileNameExtractInv = fileNameExtract + ".inv";
- if (global->gzOutput) {
- fileNameExtract += ".gz";
- fileNameExtractInv += ".gz";
- }
-
- Moses::OutputFileStream extractFile;
- Moses::OutputFileStream extractFileInv;
- extractFile.Open(fileNameExtract.c_str());
- extractFileInv.Open(fileNameExtractInv.c_str());
-
-
- // loop through all sentence pairs
- int i = sentenceOffset;
- while(true) {
- i++;
-
- if (i % 1000 == 0) {
- cerr << i << " " << flush;
- }
-
- string targetString;
- string sourceString;
- string alignmentString;
-
- bool ok = getline(tFile, targetString);
- if (!ok)
- break;
- getline(sFile, sourceString);
- getline(aFile, alignmentString);
-
- //cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;
-
- //time_t currTime = time(NULL);
- //cerr << "A " << (currTime - starttime) << endl;
-
- SentenceAlignment sentencePair;
- if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global ))
- {
- //cerr << sentence.sourceTree << endl;
- //cerr << sentence.targetTree << endl;
-
- sentencePair.FindTunnels(*g_global);
- //cerr << "C " << (time(NULL) - starttime) << endl;
- //cerr << sentencePair << endl;
-
- sentencePair.CreateLattice(*g_global);
- //cerr << "D " << (time(NULL) - starttime) << endl;
- //cerr << sentencePair << endl;
-
- sentencePair.CreateRules(*g_global);
- //cerr << "E " << (time(NULL) - starttime) << endl;
-
- //cerr << sentence.lattice->GetRules().GetSize() << endl;
- sentencePair.GetLattice().GetRules().Output(extractFile);
- sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
- }
- }
-
- tFile.Close();
- sFile.Close();
- aFile.Close();
-
- extractFile.Close();
- extractFileInv.Close();
-
- if (global->glueGrammarFlag) {
- writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
- }
-
- delete global;
-}
-
-
-void writeGlueGrammar( const string & fileName, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
-{
- ofstream grammarFile;
- grammarFile.open(fileName.c_str());
- if (!options.targetSyntax) {
- grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
- << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
- << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
- } else {
- // chose a top label that is not already a label
- string topLabel = "QQQQQQ";
- for( unsigned int i=1; i<=topLabel.length(); i++) {
- if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
- topLabel = topLabel.substr(0,i);
- break;
- }
- }
- // basic rules
- grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << endl
- << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << endl;
-
- // top rules
- for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
- i != targetTopLabelCollection.end(); i++ ) {
- grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << endl;
- }
-
- // glue rules
- for( set<string>::const_iterator i = targetLabelCollection.begin();
- i != targetLabelCollection.end(); i++ ) {
- grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << endl;
- }
- grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << endl; // glue rule for unknown word...
- }
- grammarFile.close();
-}
-
diff --git a/contrib/other-builds/extract-mixed-syntax/extract.h b/contrib/other-builds/extract-mixed-syntax/extract.h
deleted file mode 100644
index ac831f2d9..000000000
--- a/contrib/other-builds/extract-mixed-syntax/extract.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#pragma once
-
-#include <vector>
-#include <list>
-#include <map>
-#include <set>
-#include <string>
-#include <fstream>
-#include <algorithm>
-#include "SyntaxTree.h"
-#include "XmlTree.h"
-#include "Tunnel.h"
-#include "TunnelCollection.h"
-#include "SentenceAlignment.h"
-#include "Global.h"
-
-std::vector<std::string> tokenize( const char [] );
-
-#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
- _IS.getline(_LINE, _SIZE, _DELIM); \
- if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
- if (_IS.gcount() == _SIZE-1) { \
- cerr << "Line too long! Buffer overflow. Delete lines >=" \
- << _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
- << endl; \
- exit(1); \
- } \
- }
-#define LINE_MAX_LENGTH 1000000
-
-const Global *g_global;
-
-std::set< std::string > targetLabelCollection, sourceLabelCollection;
-std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;
diff --git a/contrib/other-builds/extract-mixed-syntax/tables-core.cpp b/contrib/other-builds/extract-mixed-syntax/tables-core.cpp
deleted file mode 100644
index c3c141b7f..000000000
--- a/contrib/other-builds/extract-mixed-syntax/tables-core.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-// $Id: tables-core.cpp 3131 2010-04-13 16:29:55Z pjwilliams $
-//#include "beammain.h"
-//#include "SafeGetLine.h"
-#include "tables-core.h"
-
-#define TABLE_LINE_MAX_LENGTH 1000
-#define UNKNOWNSTR "UNK"
-
-// as in beamdecoder/tables.cpp
-vector<string> tokenize( const char* input ) {
- vector< string > token;
- bool betweenWords = true;
- int start=0;
- int i=0;
- for(; input[i] != '\0'; i++) {
- bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
- if (!isSpace && betweenWords) {
- start = i;
- betweenWords = false;
- }
- else if (isSpace && !betweenWords) {
- token.push_back( string( input+start, i-start ) );
- betweenWords = true;
- }
- }
- if (!betweenWords)
- token.push_back( string( input+start, i-start ) );
- return token;
-}
-
-WORD_ID Vocabulary::storeIfNew( const WORD& word ) {
- map<WORD, WORD_ID>::iterator i = lookup.find( word );
-
- if( i != lookup.end() )
- return i->second;
-
- WORD_ID id = vocab.size();
- vocab.push_back( word );
- lookup[ word ] = id;
- return id;
-}
-
-WORD_ID Vocabulary::getWordID( const WORD& word ) {
- map<WORD, WORD_ID>::iterator i = lookup.find( word );
- if( i == lookup.end() )
- return 0;
- return i->second;
-}
-
-PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase ) {
- map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase );
- if( i != lookup.end() )
- return i->second;
-
- PHRASE_ID id = phraseTable.size();
- phraseTable.push_back( phrase );
- lookup[ phrase ] = id;
- return id;
-}
-
-PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase ) {
- map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase );
- if( i == lookup.end() )
- return 0;
- return i->second;
-}
-
-void PhraseTable::clear() {
- lookup.clear();
- phraseTable.clear();
-}
-
-void DTable::init() {
- for(int i = -10; i<10; i++)
- dtable[i] = -abs( i );
-}
-
-/*
-void DTable::load( const string& fileName ) {
- ifstream inFile;
- inFile.open(fileName.c_str());
- istream *inFileP = &inFile;
-
- char line[TABLE_LINE_MAX_LENGTH];
- int i=0;
- while(true) {
- i++;
- SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
- if (inFileP->eof()) break;
-
- vector<string> token = tokenize( line );
- if (token.size() < 2) {
- cerr << "line " << i << " in " << fileName << " too short, skipping\n";
- continue;
- }
-
- int d = atoi( token[0].c_str() );
- double prob = log( atof( token[1].c_str() ) );
- dtable[ d ] = prob;
- }
-}
-*/
-
-double DTable::get( int distortion ) {
- if (dtable.find( distortion ) == dtable.end())
- return log( 0.00001 );
- return dtable[ distortion ];
-}
-
diff --git a/contrib/other-builds/extract-mixed-syntax/tables-core.h b/contrib/other-builds/extract-mixed-syntax/tables-core.h
deleted file mode 100644
index f039ced7e..000000000
--- a/contrib/other-builds/extract-mixed-syntax/tables-core.h
+++ /dev/null
@@ -1,72 +0,0 @@
-#pragma once
-// $Id: tables-core.h 2416 2009-07-30 11:07:38Z hieuhoang1972 $
-
-#include <iostream>
-#include <fstream>
-#include <assert.h>
-#include <stdlib.h>
-#include <string>
-#include <queue>
-#include <map>
-#include <cmath>
-
-using namespace std;
-
-#define TABLE_LINE_MAX_LENGTH 1000
-#define UNKNOWNSTR "UNK"
-
-vector<string> tokenize( const char[] );
-
-//! delete and remove every element of a collection object such as map, set, list etc
-template<class COLL>
-void RemoveAllInColl(COLL &coll)
-{
- for (typename COLL::const_iterator iter = coll.begin() ; iter != coll.end() ; ++iter)
- {
- delete (*iter);
- }
- coll.clear();
-}
-
-typedef string WORD;
-typedef unsigned int WORD_ID;
-
-class Vocabulary {
- public:
- map<WORD, WORD_ID> lookup;
- vector< WORD > vocab;
- WORD_ID storeIfNew( const WORD& );
- WORD_ID getWordID( const WORD& );
- inline WORD &getWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; }
-};
-
-typedef vector< WORD_ID > PHRASE;
-typedef unsigned int PHRASE_ID;
-
-class PhraseTable {
- public:
- map< PHRASE, PHRASE_ID > lookup;
- vector< PHRASE > phraseTable;
- PHRASE_ID storeIfNew( const PHRASE& );
- PHRASE_ID getPhraseID( const PHRASE& );
- void clear();
- inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; }
-};
-
-typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC;
-
-class TTable {
- public:
- map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable;
- map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti;
-};
-
-class DTable {
- public:
- map< int, double > dtable;
- void init();
- void load( const string& );
- double get( int );
-};
-
-
diff --git a/contrib/other-builds/extract-ordering/.cproject b/contrib/other-builds/extract-ordering/.cproject
deleted file mode 100644
index 1d4522e27..000000000
--- a/contrib/other-builds/extract-ordering/.cproject
+++ /dev/null
@@ -1,134 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
- <storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" moduleId="org.eclipse.cdt.core.settings" name="Debug">
- <externalSettings/>
- <extensions>
- <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- </extensions>
- </storageModule>
- <storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.499747849" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.798364121" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract-ordering}/Debug" id="cdt.managedbuild.builder.gnu.cross.1976289814" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1324749613" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.1750299246" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.719498215" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.251118848" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.99297656" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.106920816" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
- </option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1844372739" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1178164658" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <option id="gnu.cpp.link.option.libs.1434184833" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
- <listOptionValue builtIn="false" value="z"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
- <listOptionValue builtIn="false" value="boost_system-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
- </option>
- <option id="gnu.cpp.link.option.paths.974811544" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
- </option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.904916320" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
- <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
- <additionalInput kind="additionalinput" paths="$(LIBS)"/>
- </inputType>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.1005231499" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.1318928675" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.604255673" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
- </tool>
- </toolChain>
- </folderInfo>
- </configuration>
- </storageModule>
- <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
- </cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.818331963">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" moduleId="org.eclipse.cdt.core.settings" name="Release">
- <externalSettings/>
- <extensions>
- <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- </extensions>
- </storageModule>
- <storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.818331963." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1489025499" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1052477856" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract-ordering}/Release" id="cdt.managedbuild.builder.gnu.cross.33925527" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1884790737" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" useByScannerDiscovery="false" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.197048136" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.106898878" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.1920378037" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.37950410" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1197641703" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1356351201" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2053623412" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
- <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
- <additionalInput kind="additionalinput" paths="$(LIBS)"/>
- </inputType>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.1988048517" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.1494470963" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1553727957" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
- </tool>
- </toolChain>
- </folderInfo>
- </configuration>
- </storageModule>
- <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
- </cconfiguration>
- </storageModule>
- <storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="extract-ordering.cdt.managedbuild.target.gnu.cross.exe.1840421491" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
- </storageModule>
- <storageModule moduleId="scannerConfiguration">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417;cdt.managedbuild.tool.gnu.c.compiler.input.106898878">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446;cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964;cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827;cdt.managedbuild.tool.gnu.c.compiler.input.719498215">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
- </scannerConfigBuildInfo>
- </storageModule>
- <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="refreshScope" versionNumber="2">
- <configuration configurationName="Release">
- <resource resourceType="PROJECT" workspacePath="/extract-ordering"/>
- </configuration>
- <configuration configurationName="Debug">
- <resource resourceType="PROJECT" workspacePath="/extract-ordering"/>
- </configuration>
- </storageModule>
-</cproject>
diff --git a/contrib/other-builds/extract-rules/.cproject b/contrib/other-builds/extract-rules/.cproject
index c1fa1a0cb..e79f0f526 100644
--- a/contrib/other-builds/extract-rules/.cproject
+++ b/contrib/other-builds/extract-rules/.cproject
@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1438215292">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1438215292" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.1909818145">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1909818145" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
@@ -14,45 +14,41 @@
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1438215292" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1438215292." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.124769989" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.266544803" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract-rules}/Debug" id="cdt.managedbuild.builder.gnu.cross.335858926" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1376077469" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.947547329" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.426953885" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.include.paths.1671695899" name="Include paths (-I)" superClass="gnu.c.compiler.option.include.paths"/>
- <option id="gnu.c.compiler.option.include.files.1838960067" name="Include files (-include)" superClass="gnu.c.compiler.option.include.files"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.985831394" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.53480540" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.1726371873" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.899893408" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.1099087456" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.1909818145" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1909818145." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.702289239" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.769221744" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extract-rules}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1538811811" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.417385938" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.274036343" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1227466042" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.640603457" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.231971122" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/include&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.88958138" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.61884195" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1616232021" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1411857637" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <option id="gnu.cpp.link.option.libs.109133121" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
- <listOptionValue builtIn="false" value="z"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
- <listOptionValue builtIn="false" value="boost_system-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
- </option>
- <option id="gnu.cpp.link.option.paths.1030374421" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.212337827" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.831633145" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.1948518292" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1036034505" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.982611610" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.165444158" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.paths.1351410350" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.272393234" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <option id="gnu.cpp.link.option.libs.1356683866" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="z"/>
+ </option>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1569179988" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.1391783790" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.2066621509" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1945638157" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1433595017" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.879628838" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -60,8 +56,8 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1200693544">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1200693544" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.130284564">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.130284564" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
@@ -73,31 +69,31 @@
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1200693544" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1200693544." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1113964425" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1722595316" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract-rules}/Release" id="cdt.managedbuild.builder.gnu.cross.691589832" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.593530229" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1320426973" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.947026588" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1217031668" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.130284564" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.130284564." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.933956450" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1114636926" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/extract-rules}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.1972638661" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1382194499" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.605692631" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.1543139461" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.307019882" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.771498068" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1401773863" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.1504181086" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.1645775798" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1484987112" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1332689416" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1372281360" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.2028047264" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1645644335" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1807515346" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.44234391" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1468234013" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.586184465" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.1438048814" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.733316869" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.467923425" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.1673313707" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.518252425" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.225998350" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1649512548" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -107,21 +103,21 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="extract-rules.cdt.managedbuild.target.gnu.cross.exe.1916763759" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
+ <project id="extract-rules.cdt.managedbuild.target.gnu.exe.1608401758" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1438215292;cdt.managedbuild.config.gnu.cross.exe.debug.1438215292.;cdt.managedbuild.tool.gnu.cross.c.compiler.1376077469;cdt.managedbuild.tool.gnu.c.compiler.input.985831394">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.130284564;cdt.managedbuild.config.gnu.exe.release.130284564.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1332689416;cdt.managedbuild.tool.gnu.c.compiler.input.1645644335">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1438215292;cdt.managedbuild.config.gnu.cross.exe.debug.1438215292.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.53480540;cdt.managedbuild.tool.gnu.cpp.compiler.input.88958138">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.130284564;cdt.managedbuild.config.gnu.exe.release.130284564.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.605692631;cdt.managedbuild.tool.gnu.cpp.compiler.input.771498068">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1200693544;cdt.managedbuild.config.gnu.cross.exe.release.1200693544.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1401773863;cdt.managedbuild.tool.gnu.cpp.compiler.input.1484987112">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1909818145;cdt.managedbuild.config.gnu.exe.debug.1909818145.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.212337827;cdt.managedbuild.tool.gnu.c.compiler.input.1036034505">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1200693544;cdt.managedbuild.config.gnu.cross.exe.release.1200693544.;cdt.managedbuild.tool.gnu.cross.c.compiler.593530229;cdt.managedbuild.tool.gnu.c.compiler.input.1217031668">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1909818145;cdt.managedbuild.config.gnu.exe.debug.1909818145.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.274036343;cdt.managedbuild.tool.gnu.cpp.compiler.input.61884195">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/other-builds/extract-rules/.gitignore b/contrib/other-builds/extract-rules/.gitignore
deleted file mode 100644
index 98bbc3165..000000000
--- a/contrib/other-builds/extract-rules/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/Debug
diff --git a/contrib/other-builds/extract-rules/.project b/contrib/other-builds/extract-rules/.project
index 29ffed2a9..79b72a58a 100644
--- a/contrib/other-builds/extract-rules/.project
+++ b/contrib/other-builds/extract-rules/.project
@@ -26,11 +26,6 @@
</natures>
<linkedResources>
<link>
- <name>ExtractedRule.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/ExtractedRule.h</locationURI>
- </link>
- <link>
<name>Hole.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/Hole.h</locationURI>
@@ -66,11 +61,6 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
- <name>RuleExtractionOptions.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/RuleExtractionOptions.h</locationURI>
- </link>
- <link>
<name>SentenceAlignment.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.cpp</locationURI>
@@ -116,11 +106,6 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-rules-main.cpp</locationURI>
</link>
<link>
- <name>gzfilebuf.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/gzfilebuf.h</locationURI>
- </link>
- <link>
<name>tables-core.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.cpp</locationURI>
diff --git a/contrib/other-builds/extract/.cproject b/contrib/other-builds/extract/.cproject
index 83bc724c3..10701cb6e 100644
--- a/contrib/other-builds/extract/.cproject
+++ b/contrib/other-builds/extract/.cproject
@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.386290689">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.386290689" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.2119725657">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.2119725657" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
@@ -14,43 +14,42 @@
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.386290689" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.386290689." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.671913278" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1231657738" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract}/Debug" id="cdt.managedbuild.builder.gnu.cross.571044108" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.332036857" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1292572253" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.1873227592" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1165888615" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1342023600" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.698819695" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.1451916947" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.1702398011" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.2119725657" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.2119725657." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1708444053" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.645190133" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extract}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1816006533" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.876593881" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1859867372" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1585316374" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.535775760" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.874182289" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.579278848" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1355287045" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1202195555" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.1840757183" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.876682032" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.676382830" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1856691234" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1699542791" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <option id="gnu.cpp.link.option.libs.1880730637" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.83617569" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.943560690" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.libs.599256050" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
<listOptionValue builtIn="false" value="z"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
- <listOptionValue builtIn="false" value="boost_system-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
</option>
- <option id="gnu.cpp.link.option.paths.298225069" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <option id="gnu.cpp.link.option.paths.1223834298" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <listOptionValue builtIn="false" value=""/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1339210059" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1129315792" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.976825054" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.1971927463" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.704926167" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.942430539" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1676263707" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -58,8 +57,8 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.140124152">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.140124152" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.1230189043">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1230189043" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
@@ -71,31 +70,31 @@
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.140124152" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.140124152." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1250240843" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.597335968" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/extract}/Release" id="cdt.managedbuild.builder.gnu.cross.95066247" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2096762162" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.88795016" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.383328020" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.681105644" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1230189043" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.1230189043." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.280378247" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1881910636" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/extract}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.872962284" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1342549060" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1229278587" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.509799885" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.682561415" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1043901368" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1806684544" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.553394848" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.1420596769" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1726759263" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1628542348" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1033362550" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.429156793" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.389761516" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.234409052" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.320346578" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2045242811" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.1299282565" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.461289078" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1586085606" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.417132714" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.1944597759" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.203400619" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.1190745343" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.221147938" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -105,21 +104,21 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="extract.cdt.managedbuild.target.gnu.cross.exe.1220534104" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
+ <project id="extract.cdt.managedbuild.target.gnu.exe.1053550598" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.140124152;cdt.managedbuild.config.gnu.cross.exe.release.140124152.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1806684544;cdt.managedbuild.tool.gnu.cpp.compiler.input.1726759263">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1230189043;cdt.managedbuild.config.gnu.exe.release.1230189043.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1628542348;cdt.managedbuild.tool.gnu.c.compiler.input.389761516">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.140124152;cdt.managedbuild.config.gnu.cross.exe.release.140124152.;cdt.managedbuild.tool.gnu.cross.c.compiler.2096762162;cdt.managedbuild.tool.gnu.c.compiler.input.681105644">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.2119725657;cdt.managedbuild.config.gnu.exe.debug.2119725657.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1202195555;cdt.managedbuild.tool.gnu.c.compiler.input.676382830">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.386290689;cdt.managedbuild.config.gnu.cross.exe.debug.386290689.;cdt.managedbuild.tool.gnu.cross.c.compiler.332036857;cdt.managedbuild.tool.gnu.c.compiler.input.1165888615">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1230189043;cdt.managedbuild.config.gnu.exe.release.1230189043.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1229278587;cdt.managedbuild.tool.gnu.cpp.compiler.input.1043901368">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.386290689;cdt.managedbuild.config.gnu.cross.exe.debug.386290689.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1342023600;cdt.managedbuild.tool.gnu.cpp.compiler.input.579278848">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.2119725657;cdt.managedbuild.config.gnu.exe.debug.2119725657.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1859867372;cdt.managedbuild.tool.gnu.cpp.compiler.input.1355287045">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject
index 5f0b24ef0..613c41d5c 100644
--- a/contrib/other-builds/extractor/.cproject
+++ b/contrib/other-builds/extractor/.cproject
@@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1133345948" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -25,6 +25,7 @@
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.966722418" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
@@ -42,9 +43,12 @@
</option>
<option id="gnu.cpp.link.option.libs.585257079" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="mert_lib"/>
- <listOptionValue builtIn="false" value="boost_system-mt"/>
<listOptionValue builtIn="false" value="util"/>
+ <listOptionValue builtIn="false" value="boost_system"/>
+ <listOptionValue builtIn="false" value="boost_thread"/>
<listOptionValue builtIn="false" value="z"/>
+ <listOptionValue builtIn="false" value="pthread"/>
+ <listOptionValue builtIn="false" value="rt"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.656319745" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
@@ -64,12 +68,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1385955159" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -134,4 +138,5 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
+ <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>
diff --git a/contrib/other-builds/extractor/.project b/contrib/other-builds/extractor/.project
index e4fe08579..56d560019 100644
--- a/contrib/other-builds/extractor/.project
+++ b/contrib/other-builds/extractor/.project
@@ -4,6 +4,7 @@
<comment></comment>
<projects>
<project>mert_lib</project>
+ <project>util</project>
</projects>
<buildSpec>
<buildCommand>
diff --git a/contrib/other-builds/lm/.cproject b/contrib/other-builds/lm/.cproject
index e3e47fd7e..c2dad0f8d 100644
--- a/contrib/other-builds/lm/.cproject
+++ b/contrib/other-builds/lm/.cproject
@@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -59,8 +59,18 @@
</tool>
</toolChain>
</folderInfo>
+ <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750.38452119" name="/" resourcePath="wrappers">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1621748368" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug" unusedChildren="">
+ <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.2002161718" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468"/>
+ <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.2138497585" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.macosx.exe.debug.86927135" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.macosx.exe.debug.62265891"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.macosx.base.315991018" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.macosx.base.775866405"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1319557326" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1024092140"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.1042051280" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.34201722"/>
+ </toolChain>
+ </folderInfo>
<sourceEntries>
- <entry excluding="left_test.cc|model_test.cc" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+ <entry excluding="wrappers|left_test.cc|model_test.cc" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
@@ -70,13 +80,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.203229648" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
diff --git a/contrib/other-builds/lm/.project b/contrib/other-builds/lm/.project
index 9498bb19e..7cc135fc6 100644
--- a/contrib/other-builds/lm/.project
+++ b/contrib/other-builds/lm/.project
@@ -87,6 +87,11 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/.DS_Store</locationURI>
</link>
<link>
+ <name>CMakeLists.txt</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/CMakeLists.txt</locationURI>
+ </link>
+ <link>
<name>COPYING</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/COPYING</locationURI>
@@ -122,6 +127,11 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/bhiksha.hh</locationURI>
</link>
<link>
+ <name>bin</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>binary_format.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/binary_format.cc</locationURI>
@@ -142,6 +152,16 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI>
</link>
<link>
+ <name>build_binary_main.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/build_binary_main.cc</locationURI>
+ </link>
+ <link>
+ <name>builder</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>clean.sh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/clean.sh</locationURI>
@@ -172,6 +192,16 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI>
</link>
<link>
+ <name>filter</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>fragment_main.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/fragment_main.cc</locationURI>
+ </link>
+ <link>
<name>left.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/left.hh</locationURI>
@@ -257,6 +287,11 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/query</locationURI>
</link>
<link>
+ <name>query_main.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/query_main.cc</locationURI>
+ </link>
+ <link>
<name>read_arpa.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/read_arpa.cc</locationURI>
@@ -292,6 +327,16 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/search_trie.hh</locationURI>
</link>
<link>
+ <name>sizes.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/sizes.cc</locationURI>
+ </link>
+ <link>
+ <name>sizes.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/sizes.hh</locationURI>
+ </link>
+ <link>
<name>state.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/state.hh</locationURI>
@@ -376,5 +421,990 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/word_index.hh</locationURI>
</link>
+ <link>
+ <name>wrappers</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/order.log</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/order.log</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/Jamfile</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/Jamfile</locationURI>
+ </link>
+ <link>
+ <name>builder/README.md</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/README.md</locationURI>
+ </link>
+ <link>
+ <name>builder/TODO</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/TODO</locationURI>
+ </link>
+ <link>
+ <name>builder/adjust_counts.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/adjust_counts.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/adjust_counts.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/adjust_counts.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/adjust_counts_test.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/adjust_counts_test.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/bin</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/corpus_count.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/corpus_count.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/corpus_count.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/corpus_count.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/corpus_count_test.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/corpus_count_test.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/discount.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/discount.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/dump_counts_main.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/dump_counts_main.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/hash_gamma.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/hash_gamma.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/header_info.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/header_info.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/initial_probabilities.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/initial_probabilities.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/initial_probabilities.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/initial_probabilities.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/interpolate.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/interpolate.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/interpolate.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/interpolate.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/joint_order.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/joint_order.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/lmplz_main.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/lmplz_main.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/ngram.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/ngram.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/ngram_stream.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/ngram_stream.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/pipeline.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/pipeline.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/pipeline.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/pipeline.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/print.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/print.cc</locationURI>
+ </link>
+ <link>
+ <name>builder/print.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/print.hh</locationURI>
+ </link>
+ <link>
+ <name>builder/sort.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/sort.hh</locationURI>
+ </link>
+ <link>
+ <name>filter/Jamfile</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/Jamfile</locationURI>
+ </link>
+ <link>
+ <name>filter/arpa_io.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/arpa_io.cc</locationURI>
+ </link>
+ <link>
+ <name>filter/arpa_io.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/arpa_io.hh</locationURI>
+ </link>
+ <link>
+ <name>filter/bin</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/count_io.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/count_io.hh</locationURI>
+ </link>
+ <link>
+ <name>filter/filter_main.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/filter_main.cc</locationURI>
+ </link>
+ <link>
+ <name>filter/format.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/format.hh</locationURI>
+ </link>
+ <link>
+ <name>filter/phrase.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/phrase.cc</locationURI>
+ </link>
+ <link>
+ <name>filter/phrase.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/phrase.hh</locationURI>
+ </link>
+ <link>
+ <name>filter/phrase_table_vocab_main.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/phrase_table_vocab_main.cc</locationURI>
+ </link>
+ <link>
+ <name>filter/thread.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/thread.hh</locationURI>
+ </link>
+ <link>
+ <name>filter/vocab.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/vocab.cc</locationURI>
+ </link>
+ <link>
+ <name>filter/vocab.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/vocab.hh</locationURI>
+ </link>
+ <link>
+ <name>filter/wrapper.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/wrapper.hh</locationURI>
+ </link>
+ <link>
+ <name>wrappers/README</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/wrappers/README</locationURI>
+ </link>
+ <link>
+ <name>wrappers/nplm.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/wrappers/nplm.cc</locationURI>
+ </link>
+ <link>
+ <name>wrappers/nplm.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/wrappers/nplm.hh</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/bhiksha.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/bhiksha.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/binary_format.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/binary_format.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/build_binary</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/build_binary</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/build_binary_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/build_binary_main.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/config.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/config.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/fragment</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/fragment</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/fragment_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/fragment_main.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lm_exception.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lm_exception.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/quantize.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/quantize.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/query</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/query</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/query_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/query_main.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/read_arpa.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/read_arpa.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/search_hashed.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/search_hashed.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/search_trie.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/search_trie.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/sizes.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/sizes.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/trie.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/trie.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/trie_sort.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/trie_sort.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/value_build.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/value_build.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/virtual_interface.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/virtual_interface.o</locationURI>
+ </link>
+ <link>
+ <name>bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocab.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocab.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/bhiksha.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/bhiksha.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/binary_format.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/binary_format.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/build_binary</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/build_binary</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/build_binary_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/build_binary_main.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/config.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/config.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/fragment</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/fragment</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/fragment_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/fragment_main.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lm_exception.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lm_exception.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/quantize.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/quantize.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/query</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/query</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/query_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/query_main.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/read_arpa.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/read_arpa.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/search_hashed.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/search_hashed.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/search_trie.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/search_trie.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/sizes.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/sizes.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/trie.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/trie.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/trie_sort.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/trie_sort.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/value_build.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/value_build.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/virtual_interface.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/virtual_interface.o</locationURI>
+ </link>
+ <link>
+ <name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocab.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocab.o</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.o</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.output</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.output</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.run</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.run</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.test</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/left_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.test</locationURI>
+ </link>
+ <link>
+ <name>bin/left_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/left_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/left_test.o</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.o</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.output</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.output</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.run</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.run</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.test</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/model_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.test</locationURI>
+ </link>
+ <link>
+ <name>bin/model_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/model_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/model_test.o</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.o</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.output</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.output</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.run</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.run</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.test</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/partial_test.test/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.test</locationURI>
+ </link>
+ <link>
+ <name>bin/partial_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/bin/partial_test.test/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/partial_test.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/adjust_counts.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/adjust_counts.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/corpus_count.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/corpus_count.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/dump_counts</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/dump_counts</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/dump_counts_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/dump_counts_main.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/initial_probabilities.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/initial_probabilities.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/interpolate.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/interpolate.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lmplz</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lmplz</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lmplz_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lmplz_main.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/pipeline.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/pipeline.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/print.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/print.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/adjust_counts.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/adjust_counts.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/corpus_count.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/corpus_count.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/initial_probabilities.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/initial_probabilities.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/interpolate.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/interpolate.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lmplz</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lmplz</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lmplz_main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/lmplz_main.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/pipeline.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/pipeline.o</locationURI>
+ </link>
+ <link>
+ <name>builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/print.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/builder/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/print.o</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/arpa_io.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/arpa_io.o</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/filter</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/filter</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/main.o</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/phrase.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/phrase.o</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocab.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocab.o</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/arpa_io.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/arpa_io.o</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/filter</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/filter</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/main.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/main.o</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/phrase.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/phrase.o</locationURI>
+ </link>
+ <link>
+ <name>filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocab.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/lm/filter/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocab.o</locationURI>
+ </link>
</linkedResources>
</projectDescription>
diff --git a/contrib/other-builds/manual-label/.cproject b/contrib/other-builds/manual-label/.cproject
index 2efd96e70..d9297a9fc 100644
--- a/contrib/other-builds/manual-label/.cproject
+++ b/contrib/other-builds/manual-label/.cproject
@@ -1,54 +1,54 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.2107801703">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.2107801703" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.1899954923" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1645930772" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/manual-label/Debug}" id="cdt.managedbuild.builder.gnu.cross.1703642277" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1888648788" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.1838052643" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.798368516" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.153015988" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.418888584" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.406065865" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.2107801703" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.2107801703." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.502948364" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.1431969079" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/manual-label}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.2101075234" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1118840081" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.2037265673" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.400985496" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1160903812" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.404589863" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="${workspace_loc:}/../.."/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.967940596" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.789243964" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.2033266575" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.1568929819" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.676866714" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1741441821" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1626431978" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <option id="gnu.cpp.link.option.libs.1886912770" superClass="gnu.cpp.link.option.libs" valueType="libs">
- <listOptionValue builtIn="false" value="boost_program_options-mt"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.254144861" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.319879082" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.paths.132164474" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
- <option id="gnu.cpp.link.option.paths.1541583695" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/lib64&quot;"/>
+ <option id="gnu.cpp.link.option.libs.1017214824" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <listOptionValue builtIn="false" value="boost_program_options"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1367999206" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1672776758" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.31522559" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.826957235" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.350181339" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1104732611" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.372096550" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -56,44 +56,44 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.649050588">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.649050588" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.97427761" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.564169339" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/manual-label/Release}" id="cdt.managedbuild.builder.gnu.cross.663164336" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.2135645103" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.764935013" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1841809129" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.649050588" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.649050588." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1107402972" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1038954684" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/manual-label}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.100518450" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.2005888378" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1743303968" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.968169340" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.977676916" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1889240027" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.1877584345" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.935490779" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.924128295" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1914416581" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.826081780" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2048171432" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.355530813" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.940299092" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.17718999" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.940327646" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.369758737" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1186766936" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.1527322008" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.480337803" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1788533940" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.266174128" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.558116084" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -103,22 +103,30 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="manual-label.cdt.managedbuild.target.gnu.cross.exe.2117548180" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
+ <project id="manual-label.cdt.managedbuild.target.gnu.exe.1701243340" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943;cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.649050588;cdt.managedbuild.config.gnu.exe.release.649050588.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1743303968;cdt.managedbuild.tool.gnu.cpp.compiler.input.1889240027">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607;cdt.managedbuild.tool.gnu.c.compiler.input.798368516">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.649050588;cdt.managedbuild.config.gnu.exe.release.649050588.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.924128295;cdt.managedbuild.tool.gnu.c.compiler.input.2048171432">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437;cdt.managedbuild.tool.gnu.c.compiler.input.1841809129">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.2107801703;cdt.managedbuild.config.gnu.exe.debug.2107801703.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.2037265673;cdt.managedbuild.tool.gnu.cpp.compiler.input.967940596">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503;cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.2107801703;cdt.managedbuild.config.gnu.exe.debug.2107801703.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.789243964;cdt.managedbuild.tool.gnu.c.compiler.input.676866714">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/manual-label"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/manual-label"/>
+ </configuration>
+ </storageModule>
</cproject>
diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp
index 7ef9d495d..ea2934c5a 100644
--- a/contrib/other-builds/manual-label/DeEn.cpp
+++ b/contrib/other-builds/manual-label/DeEn.cpp
@@ -1,30 +1,12 @@
#include <list>
#include "DeEn.h"
+#include "Main.h"
#include "moses/Util.h"
using namespace std;
extern bool g_debug;
-bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str)
-{
- pos += offset;
- if (pos >= source.size() || pos < 0) {
- return false;
- }
-
- const string &word = source[pos][factor];
- vector<string> soughts = Moses::Tokenize(str, " ");
- for (int i = 0; i < soughts.size(); ++i) {
- string &sought = soughts[i];
- bool found = (word == sought);
- if (found) {
- return true;
- }
- }
- return false;
-}
-
bool Contains(const Phrase &source, int start, int end, int factor, const string &str)
{
for (int pos = start; pos <= end; ++pos) {
@@ -38,8 +20,6 @@ bool Contains(const Phrase &source, int start, int end, int factor, const string
void LabelDeEn(const Phrase &source, ostream &out)
{
- typedef pair<int,int> Range;
- typedef list<Range> Ranges;
Ranges ranges;
// find ranges to label
@@ -48,39 +28,19 @@ void LabelDeEn(const Phrase &source, ostream &out)
if (IsA(source, start, -1, 1, "VAFIN")
&& IsA(source, end, +1, 1, "VVINF VVPP")
&& !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) {
- Range range(start, end);
+ Range range(start, end, "reorder-label");
ranges.push_back(range);
}
else if ((start == 0 || IsA(source, start, -1, 1, "$,"))
&& IsA(source, end, +1, 0, "zu")
&& IsA(source, end, +2, 1, "VVINF")
&& !Contains(source, start, end, 1, "$,")) {
- Range range(start, end);
+ Range range(start, end, "reorder-label");
ranges.push_back(range);
}
}
}
- // output sentence, with labels
- for (int pos = 0; pos < source.size(); ++pos) {
- // output beginning of label
- for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
- const Range &range = *iter;
- if (range.first == pos) {
- out << "<tree label=\"reorder-label\"> ";
- }
- }
-
- const Word &word = source[pos];
- out << word[0] << " ";
-
- for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
- const Range &range = *iter;
- if (range.second == pos) {
- out << "</tree> ";
- }
- }
- }
- out << endl;
-
+ OutputWithLabels(source, ranges, out);
}
+
diff --git a/contrib/other-builds/manual-label/DeEn.h b/contrib/other-builds/manual-label/DeEn.h
index 999c2dfbd..c24ce0079 100644
--- a/contrib/other-builds/manual-label/DeEn.h
+++ b/contrib/other-builds/manual-label/DeEn.h
@@ -1,10 +1,5 @@
#pragma once
-#include <iostream>
-#include <vector>
-#include <string>
-
-typedef std::vector<std::string> Word;
-typedef std::vector<Word> Phrase;
+#include "Main.h"
void LabelDeEn(const Phrase &source, std::ostream &out);
diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
new file mode 100644
index 000000000..67c2e9d84
--- /dev/null
+++ b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
@@ -0,0 +1,201 @@
+/*
+ * EnApacheChunker.cpp
+ *
+ * Created on: 28 Feb 2014
+ * Author: hieu
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <algorithm>
+#include <fstream>
+#include "EnOpenNLPChunker.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath)
+:m_openNLPPath(openNLPPath)
+{
+ // TODO Auto-generated constructor stub
+
+}
+
+EnOpenNLPChunker::~EnOpenNLPChunker() {
+ // TODO Auto-generated destructor stub
+}
+
+void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector<string> &filterList)
+{
+ // read all input to a temp file
+ char *ptr = tmpnam(NULL);
+ string inStr(ptr);
+ ofstream inFile(ptr);
+
+ string line;
+ while (getline(in, line)) {
+ Unescape(line);
+ inFile << line << endl;
+ }
+ inFile.close();
+
+ ptr = tmpnam(NULL);
+ string outStr(ptr);
+
+ // execute chunker
+ string cmd = "cat " + inStr + " | "
+ + m_openNLPPath + "/bin/opennlp POSTagger "
+ + m_openNLPPath + "/models/en-pos-maxent.bin | "
+ + m_openNLPPath + "/bin/opennlp ChunkerME "
+ + m_openNLPPath + "/models/en-chunker.bin > "
+ + outStr;
+ //g << "Executing:" << cmd << endl;
+ int ret = system(cmd.c_str());
+
+ // read result of chunker and output as Moses xml trees
+ ifstream outFile(outStr.c_str());
+
+ size_t lineNum = 0;
+ while (getline(outFile, line)) {
+ //cerr << line << endl;
+ MosesReformat(line, out, filterList);
+ out << endl;
+ ++lineNum;
+ }
+ outFile.close();
+
+ // clean up temporary files
+ remove(inStr.c_str());
+ remove(outStr.c_str());
+}
+
+void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector<string> &filterList)
+{
+ //cerr << "REFORMATING:" << line << endl;
+ bool inLabel = false;
+ vector<string> toks;
+ Moses::Tokenize(toks, line);
+ for (size_t i = 0; i < toks.size(); ++i) {
+ const string &tok = toks[i];
+
+ if (tok.substr(0, 1) == "[" && tok.substr(1,1) != "_") {
+ // start of chunk
+ string label = tok.substr(1);
+ if (UseLabel(label, filterList)) {
+ out << "<tree label=\"" << label << "\">";
+ inLabel = true;
+ }
+ }
+ else if (tok.substr(tok.size()-1, 1) == "]") {
+ // end of chunk
+ if (tok.size() > 1) {
+ if (tok.substr(1,1) == "_") {
+ // just a word that happens to be ]
+ vector<string> factors;
+ Moses::Tokenize(factors, tok, "_");
+ assert(factors.size() == 2);
+
+ Escape(factors[0]);
+ out << factors[0] << " ";
+ }
+ else {
+ // a word and end of tree
+ string word = tok.substr(0, tok.size()-1);
+
+ vector<string> factors;
+ Moses::Tokenize(factors, word, "_");
+ assert(factors.size() == 2);
+
+ Escape(factors[0]);
+ out << factors[0] << " ";
+ }
+
+ if (inLabel) {
+ out << "</tree> ";
+ inLabel = false;
+ }
+ }
+ else {
+ if (inLabel) {
+ out << "</tree> ";
+ inLabel = false;
+ }
+ }
+
+ }
+ else {
+ // lexical item
+ vector<string> factors;
+ Moses::Tokenize(factors, tok, "_");
+ if (factors.size() == 2) {
+ Escape(factors[0]);
+ out << factors[0] << " ";
+ }
+ else if (factors.size() == 1) {
+ // word is _
+ assert(tok.substr(0, 2) == "__");
+ out << "_ ";
+ }
+ else {
+ throw "Unknown format:" + tok;
+ }
+ }
+ }
+}
+
+std::string
+replaceAll( std::string const& original,
+ std::string const& before,
+ std::string const& after )
+{
+ std::string retval;
+ std::string::const_iterator end = original.end();
+ std::string::const_iterator current = original.begin();
+ std::string::const_iterator next =
+ std::search( current, end, before.begin(), before.end() );
+ while ( next != end ) {
+ retval.append( current, next );
+ retval.append( after );
+ current = next + before.size();
+ next = std::search( current, end, before.begin(), before.end() );
+ }
+ retval.append( current, next );
+ return retval;
+}
+
+void EnOpenNLPChunker::Escape(string &line)
+{
+ line = replaceAll(line, "&", "&amp;");
+ line = replaceAll(line, "|", "&#124;");
+ line = replaceAll(line, "<", "&lt;");
+ line = replaceAll(line, ">", "&gt;");
+ line = replaceAll(line, "'", "&apos;");
+ line = replaceAll(line, "\"", "&quot;");
+ line = replaceAll(line, "[", "&#91;");
+ line = replaceAll(line, "]", "&#93;");
+}
+
+void EnOpenNLPChunker::Unescape(string &line)
+{
+ line = replaceAll(line, "&#124;", "|");
+ line = replaceAll(line, "&lt;", "<");
+ line = replaceAll(line, "&gt;", ">");
+ line = replaceAll(line, "&quot;", "\"");
+ line = replaceAll(line, "&apos;", "'");
+ line = replaceAll(line, "&#91;", "[");
+ line = replaceAll(line, "&#93;", "]");
+ line = replaceAll(line, "&amp;", "&");
+}
+
+bool EnOpenNLPChunker::UseLabel(const std::string &label, const std::vector<std::string> &filterList) const
+{
+ if (filterList.size() == 0) {
+ return true;
+ }
+
+ for (size_t i = 0; i < filterList.size(); ++i) {
+ if (label == filterList[i]) {
+ return true;
+ }
+ }
+ return false;
+}
diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.h b/contrib/other-builds/manual-label/EnOpenNLPChunker.h
new file mode 100644
index 000000000..df9f90e42
--- /dev/null
+++ b/contrib/other-builds/manual-label/EnOpenNLPChunker.h
@@ -0,0 +1,29 @@
+/*
+ * EnApacheChunker.h
+ *
+ * Created on: 28 Feb 2014
+ * Author: hieu
+ */
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <iostream>
+
+class EnOpenNLPChunker {
+public:
+ EnOpenNLPChunker(const std::string &openNLPPath);
+ virtual ~EnOpenNLPChunker();
+ void Process(std::istream &in, std::ostream &out, const std::vector<std::string> &filterList);
+protected:
+ const std::string m_openNLPPath;
+
+ void Escape(std::string &line);
+ void Unescape(std::string &line);
+
+ void MosesReformat(const std::string &line, std::ostream &out, const std::vector<std::string> &filterList);
+
+ bool UseLabel(const std::string &label, const std::vector<std::string> &filterList) const;
+};
+
diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp b/contrib/other-builds/manual-label/EnPhrasalVerb.cpp
new file mode 100644
index 000000000..4bee9b941
--- /dev/null
+++ b/contrib/other-builds/manual-label/EnPhrasalVerb.cpp
@@ -0,0 +1,226 @@
+#include <iostream>
+#include <list>
+#include <limits>
+#include <algorithm>
+#include "EnPhrasalVerb.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+void EnPhrasalVerb(const Phrase &source, int revision, ostream &out)
+{
+ Ranges ranges;
+
+ // find ranges to label
+ for (int start = 0; start < source.size(); ++start) {
+ size_t end = std::numeric_limits<size_t>::max();
+
+ if (IsA(source, start, 0, 0, "ask asked asking")) {
+ end = Found(source, start, 0, "out");
+ }
+ else if (IsA(source, start, 0, 0, "back backed backing")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "blow blown blew")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "break broke broken")) {
+ end = Found(source, start, 0, "down up in");
+ }
+ else if (IsA(source, start, 0, 0, "bring brought bringing")) {
+ end = Found(source, start, 0, "down up in");
+ }
+ else if (IsA(source, start, 0, 0, "call called calling")) {
+ end = Found(source, start, 0, "back up off");
+ }
+ else if (IsA(source, start, 0, 0, "check checked checking")) {
+ end = Found(source, start, 0, "out in");
+ }
+ else if (IsA(source, start, 0, 0, "cheer cheered cheering")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "clean cleaned cleaning")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "cross crossed crossing")) {
+ end = Found(source, start, 0, "out");
+ }
+ else if (IsA(source, start, 0, 0, "cut cutting")) {
+ end = Found(source, start, 0, "down off out");
+ }
+ else if (IsA(source, start, 0, 0, "do did done")) {
+ end = Found(source, start, 0, "over up");
+ }
+ else if (IsA(source, start, 0, 0, "drop dropped dropping")) {
+ end = Found(source, start, 0, "off");
+ }
+ else if (IsA(source, start, 0, 0, "figure figured figuring")) {
+ end = Found(source, start, 0, "out");
+ }
+ else if (IsA(source, start, 0, 0, "fill filled filling")) {
+ end = Found(source, start, 0, "in out up");
+ }
+ else if (IsA(source, start, 0, 0, "find found finding")) {
+ end = Found(source, start, 0, "out");
+ }
+ else if (IsA(source, start, 0, 0, "get got getting gotten")) {
+ end = Found(source, start, 0, "across over back");
+ }
+ else if (IsA(source, start, 0, 0, "give given gave giving")) {
+ end = Found(source, start, 0, "away back out up");
+ }
+ else if (IsA(source, start, 0, 0, "hand handed handing")) {
+ end = Found(source, start, 0, "down in over");
+ }
+ else if (IsA(source, start, 0, 0, "hold held holding")) {
+ end = Found(source, start, 0, "back up");
+ }
+ else if (IsA(source, start, 0, 0, "keep kept keeping")) {
+ end = Found(source, start, 0, "from up");
+ }
+ else if (IsA(source, start, 0, 0, "let letting")) {
+ end = Found(source, start, 0, "down in");
+ }
+ else if (IsA(source, start, 0, 0, "look looked looking")) {
+ end = Found(source, start, 0, "over up");
+ }
+ else if (IsA(source, start, 0, 0, "make made making")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "mix mixed mixing")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "pass passed passing")) {
+ end = Found(source, start, 0, "out up");
+ }
+ else if (IsA(source, start, 0, 0, "pay payed paying")) {
+ end = Found(source, start, 0, "back");
+ }
+ else if (IsA(source, start, 0, 0, "pick picked picking")) {
+ end = Found(source, start, 0, "out");
+ }
+ else if (IsA(source, start, 0, 0, "point pointed pointing")) {
+ end = Found(source, start, 0, "out");
+ }
+ else if (IsA(source, start, 0, 0, "put putting")) {
+ end = Found(source, start, 0, "down off out together on");
+ }
+ else if (IsA(source, start, 0, 0, "send sending")) {
+ end = Found(source, start, 0, "back");
+ }
+ else if (IsA(source, start, 0, 0, "set setting")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "sort sorted sorting")) {
+ end = Found(source, start, 0, "out");
+ }
+ else if (IsA(source, start, 0, 0, "switch switched switching")) {
+ end = Found(source, start, 0, "off on");
+ }
+ else if (IsA(source, start, 0, 0, "take took taking")) {
+ end = Found(source, start, 0, "apart back off out");
+ }
+ else if (IsA(source, start, 0, 0, "tear torn tearing")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "think thought thinking")) {
+ end = Found(source, start, 0, "over");
+ }
+ else if (IsA(source, start, 0, 0, "thrown threw thrown throwing")) {
+ end = Found(source, start, 0, "away");
+ }
+ else if (IsA(source, start, 0, 0, "turn turned turning")) {
+ end = Found(source, start, 0, "down off on");
+ }
+ else if (IsA(source, start, 0, 0, "try tried trying")) {
+ end = Found(source, start, 0, "on out");
+ }
+ else if (IsA(source, start, 0, 0, "use used using")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "warm warmed warming")) {
+ end = Found(source, start, 0, "up");
+ }
+ else if (IsA(source, start, 0, 0, "work worked working")) {
+ end = Found(source, start, 0, "out");
+ }
+
+ // found range to label
+ if (end != std::numeric_limits<size_t>::max() &&
+ end > start + 1) {
+ bool add = true;
+ if (revision == 1 && Exist(source,
+ start + 1,
+ end - 1,
+ 1,
+ "VB VBD VBG VBN VBP VBZ")) {
+ // there's a verb in between
+ add = false;
+ }
+
+ if (add) {
+ Range range(start + 1, end - 1, "reorder-label");
+ ranges.push_back(range);
+ }
+ }
+ }
+
+ OutputWithLabels(source, ranges, out);
+}
+
+bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str)
+{
+ vector<string> soughts = Moses::Tokenize(str, " ");
+ for (size_t i = start; i <= end; ++i) {
+ const Word &word = source[i];
+ bool found = Found(word, factor, soughts);
+ if (found) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+size_t Found(const Phrase &source, int pos, int factor, const std::string &str)
+{
+ const size_t MAX_RANGE = 10;
+
+ vector<string> soughts = Moses::Tokenize(str, " ");
+ vector<string> puncts = Moses::Tokenize(". : , ;", " ");
+
+
+ size_t maxEnd = std::min(source.size(), (size_t) pos + MAX_RANGE);
+ for (size_t i = pos + 1; i < maxEnd; ++i) {
+ const Word &word = source[i];
+ bool found;
+
+ found = Found(word, factor, puncts);
+ if (found) {
+ return std::numeric_limits<size_t>::max();
+ }
+
+ found = Found(word, factor, soughts);
+ if (found) {
+ return i;
+ }
+ }
+
+ return std::numeric_limits<size_t>::max();
+}
+
+
+bool Found(const Word &word, int factor, const vector<string> &soughts)
+{
+ const string &element = word[factor];
+ for (size_t i = 0; i < soughts.size(); ++i) {
+ const string &sought = soughts[i];
+ bool found = (element == sought);
+ if (found) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.h b/contrib/other-builds/manual-label/EnPhrasalVerb.h
new file mode 100644
index 000000000..4cb5f7348
--- /dev/null
+++ b/contrib/other-builds/manual-label/EnPhrasalVerb.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "Main.h"
+
+// roll your own identification of phrasal verbs
+void EnPhrasalVerb(const Phrase &source, int revision, std::ostream &out);
+
+bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str);
+size_t Found(const Phrase &source, int pos, int factor, const std::string &str);
+bool Found(const Word &word, int factor, const std::vector<std::string> &soughts);
+
diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp b/contrib/other-builds/manual-label/LabelByInitialLetter.cpp
new file mode 100644
index 000000000..e4136a7ea
--- /dev/null
+++ b/contrib/other-builds/manual-label/LabelByInitialLetter.cpp
@@ -0,0 +1,29 @@
+#include "LabelByInitialLetter.h"
+#include "Main.h"
+
+using namespace std;
+
+void LabelByInitialLetter(const Phrase &source, std::ostream &out)
+{
+ Ranges ranges;
+
+ for (int start = 0; start < source.size(); ++start) {
+ const string &startWord = source[start][0];
+ string startChar = startWord.substr(0,1);
+
+ for (int end = start + 1; end < source.size(); ++end) {
+ const string &endWord = source[end][0];
+ string endChar = endWord.substr(0,1);
+
+ if (startChar == endChar) {
+ Range range(start, end, startChar + "-label");
+ ranges.push_back(range);
+ }
+ }
+ }
+
+ OutputWithLabels(source, ranges, out);
+
+}
+
+
diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.h b/contrib/other-builds/manual-label/LabelByInitialLetter.h
new file mode 100644
index 000000000..ba8d34c19
--- /dev/null
+++ b/contrib/other-builds/manual-label/LabelByInitialLetter.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "Main.h"
+
+void LabelByInitialLetter(const Phrase &source, std::ostream &out);
+
diff --git a/contrib/other-builds/manual-label/Main.cpp b/contrib/other-builds/manual-label/Main.cpp
new file mode 100644
index 000000000..896f70590
--- /dev/null
+++ b/contrib/other-builds/manual-label/Main.cpp
@@ -0,0 +1,195 @@
+#include <iostream>
+#include <cstdlib>
+#include <boost/program_options.hpp>
+#include "moses/Util.h"
+#include "Main.h"
+#include "DeEn.h"
+#include "EnPhrasalVerb.h"
+#include "EnOpenNLPChunker.h"
+#include "LabelByInitialLetter.h"
+
+using namespace std;
+
+bool g_debug = false;
+
+Phrase Tokenize(const string &line);
+
+int main(int argc, char** argv)
+{
+ cerr << "Starting" << endl;
+
+ namespace po = boost::program_options;
+ po::options_description desc("Options");
+ desc.add_options()
+ ("help", "Print help messages")
+
+ ("input,i", po::value<string>(), "Input file. Otherwise it will read from standard in")
+ ("output,o", po::value<string>(), "Output file. Otherwise it will print from standard out")
+
+ ("source-language,s", po::value<string>()->required(), "Source Language")
+ ("target-language,t", po::value<string>()->required(), "Target Language")
+ ("revision,r", po::value<int>()->default_value(0), "Revision")
+ ("filter", po::value<string>(), "Only use labels from this comma-separated list")
+
+ ("opennlp", po::value<string>()->default_value(""), "Path to Apache OpenNLP toolkit")
+
+ ;
+
+ po::variables_map vm;
+ try
+ {
+ po::store(po::parse_command_line(argc, argv, desc),
+ vm); // can throw
+
+ /** --help option
+ */
+ if ( vm.count("help") )
+ {
+ std::cout << "Basic Command Line Parameter App" << std::endl
+ << desc << std::endl;
+ return EXIT_SUCCESS;
+ }
+
+ po::notify(vm); // throws on error, so do after help in case
+ // there are any problems
+ }
+ catch(po::error& e)
+ {
+ std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
+ std::cerr << desc << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ istream *inStrm = &cin;
+ if (vm.count("input")) {
+ string inStr = vm["input"].as<string>();
+ cerr << "inStr=" << inStr << endl;
+ ifstream *inFile = new ifstream(inStr.c_str());
+ inStrm = inFile;
+ }
+
+ ostream *outStrm = &cout;
+ if (vm.count("output")) {
+ string outStr = vm["output"].as<string>();
+ cerr << "outStr=" << outStr << endl;
+ ostream *outFile = new ofstream(outStr.c_str());
+ outStrm = outFile;
+ }
+
+ vector<string> filterList;
+ if (vm.count("filter")) {
+ string filter = vm["filter"].as<string>();
+ Moses::Tokenize(filterList, filter, ",");
+ }
+
+ string sourceLang = vm["source-language"].as<string>();
+ string targetLang = vm["target-language"].as<string>();
+ int revision = vm["revision"].as<int>();
+
+ cerr << sourceLang << " " << targetLang << " " << revision << endl;
+
+ if (sourceLang == "en" && revision == 2) {
+    if (vm["opennlp"].as<string>().empty()) {
+ throw "Need path to openNLP toolkit";
+ }
+
+ string openNLPPath = vm["opennlp"].as<string>();
+ EnOpenNLPChunker chunker(openNLPPath);
+ chunker.Process(*inStrm, *outStrm, filterList);
+ }
+ else {
+ // process line-by-line
+ string line;
+ size_t lineNum = 1;
+
+ while (getline(*inStrm, line)) {
+ //cerr << lineNum << ":" << line << endl;
+ if (lineNum % 1000 == 0) {
+ cerr << lineNum << " ";
+ }
+
+ Phrase source = Tokenize(line);
+
+ if (revision == 600 ) {
+ LabelByInitialLetter(source, *outStrm);
+ }
+ else if (sourceLang == "de" && targetLang == "en") {
+ LabelDeEn(source, *outStrm);
+ }
+ else if (sourceLang == "en") {
+ if (revision == 0 || revision == 1) {
+ EnPhrasalVerb(source, revision, *outStrm);
+ }
+ else if (revision == 2) {
+    	  string openNLPPath = vm["opennlp"].as<string>();
+ EnOpenNLPChunker chunker(openNLPPath);
+ }
+ }
+
+ ++lineNum;
+ }
+ }
+
+
+ cerr << "Finished" << endl;
+ return EXIT_SUCCESS;
+}
+
+Phrase Tokenize(const string &line)
+{
+ Phrase ret;
+
+ vector<string> toks = Moses::Tokenize(line);
+ for (size_t i = 0; i < toks.size(); ++i) {
+ Word word = Moses::Tokenize(toks[i], "|");
+ ret.push_back(word);
+ }
+
+ return ret;
+}
+
+bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str)
+{
+ pos += offset;
+ if (pos >= source.size() || pos < 0) {
+ return false;
+ }
+
+ const string &word = source[pos][factor];
+ vector<string> soughts = Moses::Tokenize(str, " ");
+  for (size_t i = 0; i < soughts.size(); ++i) {
+ string &sought = soughts[i];
+ bool found = (word == sought);
+ if (found) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+void OutputWithLabels(const Phrase &source, const Ranges ranges, ostream &out)
+{
+ // output sentence, with labels
+ for (int pos = 0; pos < source.size(); ++pos) {
+ // output beginning of label
+ for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
+ const Range &range = *iter;
+ if (range.range.first == pos) {
+ out << "<tree label=\"" + range.label + "\"> ";
+ }
+ }
+
+ const Word &word = source[pos];
+ out << word[0] << " ";
+
+ for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
+ const Range &range = *iter;
+ if (range.range.second == pos) {
+ out << "</tree> ";
+ }
+ }
+ }
+ out << endl;
+
+}
diff --git a/contrib/other-builds/manual-label/Main.h b/contrib/other-builds/manual-label/Main.h
new file mode 100644
index 000000000..036da0d45
--- /dev/null
+++ b/contrib/other-builds/manual-label/Main.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <list>
+
+typedef std::vector<std::string> Word;
+typedef std::vector<Word> Phrase;
+
+struct Range
+{
+ Range(int start,int end, const std::string &l)
+ :range(start, end)
+ ,label(l)
+ {}
+
+ std::pair<int,int> range;
+ std::string label;
+};
+
+typedef std::list<Range> Ranges;
+
+bool IsA(const Phrase &source, int pos, int offset, int factor, const std::string &str);
+void OutputWithLabels(const Phrase &source, const Ranges ranges, std::ostream &out);
+
+
diff --git a/contrib/other-builds/manual-label/Makefile b/contrib/other-builds/manual-label/Makefile
index 60ce975cd..f24d69dc7 100644
--- a/contrib/other-builds/manual-label/Makefile
+++ b/contrib/other-builds/manual-label/Makefile
@@ -4,10 +4,11 @@ clean:
rm -f *.o manual-label
.cpp.o:
- g++ -I../../../ -O6 -g -c $<
+ g++ -I../../../boost/include -I../../../ -O3 -g -c $<
-manual-label: DeEn.o manual-label.o
+OBJECTS = DeEn.o EnOpenNLPChunker.o EnPhrasalVerb.o Main.o LabelByInitialLetter.o
- g++ DeEn.o manual-label.o -lz -lboost_program_options-mt -o manual-label
+manual-label: $(OBJECTS)
+ g++ $(OBJECTS) -L../../../boost/lib64 -lz -lboost_program_options-mt -o manual-label
diff --git a/contrib/other-builds/manual-label/manual-label.cpp b/contrib/other-builds/manual-label/manual-label.cpp
deleted file mode 100644
index 4500d2c84..000000000
--- a/contrib/other-builds/manual-label/manual-label.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <boost/program_options.hpp>
-#include "moses/Util.h"
-#include "DeEn.h"
-
-using namespace std;
-
-bool g_debug = false;
-
-Phrase Tokenize(const string &line);
-
-int main(int argc, char** argv)
-{
- cerr << "Starting" << endl;
-
- namespace po = boost::program_options;
- po::options_description desc("Options");
- desc.add_options()
- ("help", "Print help messages")
- ("add", "additional options")
- ("source-language,s", po::value<string>()->required(), "Source Language")
- ("target-language,t", po::value<string>()->required(), "Target Language");
-
- po::variables_map vm;
- try
- {
- po::store(po::parse_command_line(argc, argv, desc),
- vm); // can throw
-
- /** --help option
- */
- if ( vm.count("help") )
- {
- std::cout << "Basic Command Line Parameter App" << std::endl
- << desc << std::endl;
- return EXIT_SUCCESS;
- }
-
- po::notify(vm); // throws on error, so do after help in case
- // there are any problems
- }
- catch(po::error& e)
- {
- std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
- std::cerr << desc << std::endl;
- return EXIT_FAILURE;
- }
-
- string sourceLang = vm["source-language"].as<string>();
- string targetLang = vm["target-language"].as<string>();
- cerr << sourceLang << " " << targetLang << endl;
-
- string line;
- size_t lineNum = 1;
-
- while (getline(cin, line)) {
- //cerr << lineNum << ":" << line << endl;
- if (lineNum % 1000 == 0) {
- cerr << lineNum << " ";
- }
-
- Phrase source = Tokenize(line);
-
- LabelDeEn(source, cout);
-
- ++lineNum;
- }
-
-
-
- cerr << "Finished" << endl;
- return EXIT_SUCCESS;
-}
-
-Phrase Tokenize(const string &line)
-{
- Phrase ret;
-
- vector<string> toks = Moses::Tokenize(line);
- for (size_t i = 0; i < toks.size(); ++i) {
- Word word = Moses::Tokenize(toks[i], "|");
- ret.push_back(word);
- }
-
- return ret;
-}
-
diff --git a/contrib/other-builds/mert_lib/.project b/contrib/other-builds/mert_lib/.project
index 9f7476874..8c5b742b0 100644
--- a/contrib/other-builds/mert_lib/.project
+++ b/contrib/other-builds/mert_lib/.project
@@ -202,6 +202,16 @@
<locationURI>PARENT-3-PROJECT_LOC/mert/GzFileBuf.h</locationURI>
</link>
<link>
+ <name>HwcmScorer.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/mert/HwcmScorer.cpp</locationURI>
+ </link>
+ <link>
+ <name>HwcmScorer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/mert/HwcmScorer.h</locationURI>
+ </link>
+ <link>
<name>HypPackEnumerator.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/HypPackEnumerator.cpp</locationURI>
@@ -212,6 +222,16 @@
<locationURI>PARENT-3-PROJECT_LOC/mert/HypPackEnumerator.h</locationURI>
</link>
<link>
+ <name>InternalTree.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/InternalTree.cpp</locationURI>
+ </link>
+ <link>
+ <name>InternalTree.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/InternalTree.h</locationURI>
+ </link>
+ <link>
<name>InterpolatedScorer.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/InterpolatedScorer.cpp</locationURI>
diff --git a/contrib/other-builds/mira/.cproject b/contrib/other-builds/mira/.cproject
deleted file mode 100644
index 72f66b5fb..000000000
--- a/contrib/other-builds/mira/.cproject
+++ /dev/null
@@ -1,176 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
- <storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1385309092">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1385309092" moduleId="org.eclipse.cdt.core.settings" name="Debug">
- <externalSettings/>
- <extensions>
- <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- </extensions>
- </storageModule>
- <storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1385309092" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1385309092." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.377583226" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.2071063316" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/mira/Debug}" id="cdt.managedbuild.builder.gnu.cross.881204887" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1218877049" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1094111510" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.2142370493" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1560615310" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool command="g++" id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.115638939" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.1315998281" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.778416356" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.preprocessor.def.317569168" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
- <listOptionValue builtIn="false" value="HAVE_BOOST"/>
- <listOptionValue builtIn="false" value="MAX_NUM_FACTORS=4"/>
- <listOptionValue builtIn="false" value="TRACE_ENABLE"/>
- <listOptionValue builtIn="false" value="WITH_THREADS"/>
- </option>
- <option id="gnu.cpp.compiler.option.include.paths.1743631842" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
- </option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1454738757" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1480777831" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.485611005" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <option id="gnu.cpp.link.option.libs.1007486529" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
- <listOptionValue builtIn="false" value="moses"/>
- <listOptionValue builtIn="false" value="irstlm"/>
- <listOptionValue builtIn="false" value="dstruct"/>
- <listOptionValue builtIn="false" value="flm"/>
- <listOptionValue builtIn="false" value="oolm"/>
- <listOptionValue builtIn="false" value="lattice"/>
- <listOptionValue builtIn="false" value="misc"/>
- <listOptionValue builtIn="false" value="dalm"/>
- <listOptionValue builtIn="false" value="search"/>
- <listOptionValue builtIn="false" value="RandLM"/>
- <listOptionValue builtIn="false" value="OnDiskPt"/>
- <listOptionValue builtIn="false" value="lm"/>
- <listOptionValue builtIn="false" value="util"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
- <listOptionValue builtIn="false" value="boost_system-mt"/>
- <listOptionValue builtIn="false" value="boost_thread-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
- <listOptionValue builtIn="false" value="boost_program_options-mt"/>
- <listOptionValue builtIn="false" value="pthread"/>
- <listOptionValue builtIn="false" value="z"/>
- <listOptionValue builtIn="false" value="bz2"/>
- <listOptionValue builtIn="false" value="dl"/>
- <listOptionValue builtIn="false" value="rt"/>
- </option>
- <option id="gnu.cpp.link.option.paths.132082917" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../DALM/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../nplm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../randlm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../cmph/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
- <listOptionValue builtIn="false" value="/opt/local/lib"/>
- </option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1827477602" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
- <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
- <additionalInput kind="additionalinput" paths="$(LIBS)"/>
- </inputType>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.1554055737" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.1335019965" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1106765201" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
- </tool>
- </toolChain>
- </folderInfo>
- </configuration>
- </storageModule>
- <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
- </cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.2038764866">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.2038764866" moduleId="org.eclipse.cdt.core.settings" name="Release">
- <externalSettings/>
- <extensions>
- <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- </extensions>
- </storageModule>
- <storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.2038764866" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.2038764866." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1722081106" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.36030994" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/mira/Release}" id="cdt.managedbuild.builder.gnu.cross.329863268" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.299271422" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1049770857" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.1354488968" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.674520633" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.568828285" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.1042930447" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.305563840" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1424960921" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.460791828" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.945282347" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.561813601" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
- <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
- <additionalInput kind="additionalinput" paths="$(LIBS)"/>
- </inputType>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.1813861310" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.991451934" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1702585996" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
- </tool>
- </toolChain>
- </folderInfo>
- </configuration>
- </storageModule>
- <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
- </cconfiguration>
- </storageModule>
- <storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="mira.cdt.managedbuild.target.gnu.cross.exe.1862989567" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
- </storageModule>
- <storageModule moduleId="scannerConfiguration">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1385309092;cdt.managedbuild.config.gnu.cross.exe.debug.1385309092.;cdt.managedbuild.tool.gnu.cross.c.compiler.1218877049;cdt.managedbuild.tool.gnu.c.compiler.input.1560615310">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.2038764866;cdt.managedbuild.config.gnu.cross.exe.release.2038764866.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.568828285;cdt.managedbuild.tool.gnu.cpp.compiler.input.1424960921">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1385309092;cdt.managedbuild.config.gnu.cross.exe.debug.1385309092.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.115638939;cdt.managedbuild.tool.gnu.cpp.compiler.input.1454738757">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.2038764866;cdt.managedbuild.config.gnu.cross.exe.release.2038764866.;cdt.managedbuild.tool.gnu.cross.c.compiler.299271422;cdt.managedbuild.tool.gnu.c.compiler.input.674520633">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
- </scannerConfigBuildInfo>
- </storageModule>
- <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="refreshScope" versionNumber="2">
- <configuration configurationName="Release">
- <resource resourceType="PROJECT" workspacePath="/mira"/>
- </configuration>
- <configuration configurationName="Debug">
- <resource resourceType="PROJECT" workspacePath="/mira"/>
- </configuration>
- </storageModule>
- <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
- <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
-</cproject>
diff --git a/contrib/other-builds/mira/.project b/contrib/other-builds/mira/.project
deleted file mode 100644
index 03838731f..000000000
--- a/contrib/other-builds/mira/.project
+++ /dev/null
@@ -1,81 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
- <name>mira</name>
- <comment></comment>
- <projects>
- <project>mert_lib</project>
- <project>moses</project>
- </projects>
- <buildSpec>
- <buildCommand>
- <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
- <triggers>clean,full,incremental,</triggers>
- <arguments>
- </arguments>
- </buildCommand>
- <buildCommand>
- <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
- <triggers>full,incremental,</triggers>
- <arguments>
- </arguments>
- </buildCommand>
- </buildSpec>
- <natures>
- <nature>org.eclipse.cdt.core.cnature</nature>
- <nature>org.eclipse.cdt.core.ccnature</nature>
- <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
- <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
- </natures>
- <linkedResources>
- <link>
- <name>Decoder.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/Decoder.cpp</locationURI>
- </link>
- <link>
- <name>Decoder.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/Decoder.h</locationURI>
- </link>
- <link>
- <name>Hildreth.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/Hildreth.cpp</locationURI>
- </link>
- <link>
- <name>Hildreth.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/Hildreth.h</locationURI>
- </link>
- <link>
- <name>HypothesisQueue.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/HypothesisQueue.cpp</locationURI>
- </link>
- <link>
- <name>HypothesisQueue.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/HypothesisQueue.h</locationURI>
- </link>
- <link>
- <name>Main.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/Main.cpp</locationURI>
- </link>
- <link>
- <name>Main.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/Main.h</locationURI>
- </link>
- <link>
- <name>MiraOptimiser.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/MiraOptimiser.cpp</locationURI>
- </link>
- <link>
- <name>Perceptron.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/mira/Perceptron.cpp</locationURI>
- </link>
- </linkedResources>
-</projectDescription>
diff --git a/contrib/other-builds/moses-chart-cmd.vcxproj b/contrib/other-builds/moses-chart-cmd.vcxproj
deleted file mode 100644
index 25fe74588..000000000
--- a/contrib/other-builds/moses-chart-cmd.vcxproj
+++ /dev/null
@@ -1,115 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
- <ItemGroup Label="ProjectConfigurations">
- <ProjectConfiguration Include="Debug|Win32">
- <Configuration>Debug</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Release|Win32">
- <Configuration>Release</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- </ItemGroup>
- <PropertyGroup Label="Globals">
- <ProjectGuid>{C3AF5C05-D4EC-41D2-8319-D1E69B9B5820}</ProjectGuid>
- <RootNamespace>moseschartcmd</RootNamespace>
- <Keyword>Win32Proj</Keyword>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
- <ConfigurationType>Application</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
- <ConfigurationType>Application</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
- <ImportGroup Label="ExtensionSettings">
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <PropertyGroup Label="UserMacros" />
- <PropertyGroup>
- <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
- </PropertyGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- <ClCompile>
- <Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>C:\Program Files\boost\boost_1_47;$(SolutionDir)/moses/src;$(SolutionDir)/kenlm;$(SolutionDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;LM_INTERNAL;TRACE_ENABLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <MinimalRebuild>true</MinimalRebuild>
- <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
- <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
- </ClCompile>
- <Link>
- <AdditionalDependencies>zdll.lib;$(SolutionDir)$(Configuration)\moses.lib;$(SolutionDir)$(Configuration)\kenlm.lib;%(AdditionalDependencies)</AdditionalDependencies>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <SubSystem>Console</SubSystem>
- <RandomizedBaseAddress>false</RandomizedBaseAddress>
- <DataExecutionPrevention>
- </DataExecutionPrevention>
- <TargetMachine>MachineX86</TargetMachine>
- </Link>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- <ClCompile>
- <AdditionalIncludeDirectories>C:\Program Files\boost\boost_1_47;$(SolutionDir)/moses/src;$(SolutionDir)/kenlm;$(SolutionDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;LM_INTERNAL;TRACE_ENABLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
- </ClCompile>
- <Link>
- <AdditionalDependencies>zdll.lib;$(SolutionDir)$(Configuration)\moses.lib;$(SolutionDir)$(Configuration)\kenlm.lib;%(AdditionalDependencies)</AdditionalDependencies>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <SubSystem>Console</SubSystem>
- <OptimizeReferences>true</OptimizeReferences>
- <EnableCOMDATFolding>true</EnableCOMDATFolding>
- <RandomizedBaseAddress>false</RandomizedBaseAddress>
- <DataExecutionPrevention>
- </DataExecutionPrevention>
- <TargetMachine>MachineX86</TargetMachine>
- </Link>
- </ItemDefinitionGroup>
- <ItemGroup>
- <ClCompile Include="src\IOWrapper.cpp" />
- <ClCompile Include="src\Main.cpp" />
- <ClCompile Include="src\mbr.cpp" />
- </ItemGroup>
- <ItemGroup>
- <ClInclude Include="src\IOWrapper.h" />
- <ClInclude Include="src\Main.h" />
- <ClInclude Include="src\mbr.h" />
- </ItemGroup>
- <ItemGroup>
- <ProjectReference Include="..\moses\moses.vcxproj">
- <Project>{8122157a-0de5-44ff-8e5b-024ed6ace7af}</Project>
- <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
- </ProjectReference>
- <ProjectReference Include="..\OnDiskPt\OnDiskPt.vcxproj">
- <Project>{8b07671b-cbaf-4514-affd-ce238cd427e9}</Project>
- <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
- </ProjectReference>
- </ItemGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
- <ImportGroup Label="ExtensionTargets">
- </ImportGroup>
-</Project> \ No newline at end of file
diff --git a/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj b/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj
deleted file mode 100644
index cc0f3caf7..000000000
--- a/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj
+++ /dev/null
@@ -1,486 +0,0 @@
-// !$*UTF8*$!
-{
- archiveVersion = 1;
- classes = {
- };
- objectVersion = 45;
- objects = {
-
-/* Begin PBXBuildFile section */
- 1EAF9DC614B9F8CD005E8EBD /* liblm.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EAF9DC314B9F8BA005E8EBD /* liblm.a */; };
- 1EAF9DC714B9F8CD005E8EBD /* libmoses.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EAF9DAD14B9F8AD005E8EBD /* libmoses.a */; };
- 1EAF9DC814B9F8CD005E8EBD /* libOnDiskPt.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EAF9DB614B9F8B1005E8EBD /* libOnDiskPt.a */; };
- 1EBC53E7164C4B1400ADFA2C /* libsearch.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EBC53BD164C4AC300ADFA2C /* libsearch.a */; };
- 1EF0719F14B9F1D40052152A /* IOWrapper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0718A14B9F1D40052152A /* IOWrapper.cpp */; };
- 1EF071A214B9F1D40052152A /* Main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0718E14B9F1D40052152A /* Main.cpp */; };
- 1EF071A414B9F1D40052152A /* mbr.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0719114B9F1D40052152A /* mbr.cpp */; };
- 1EF071A614B9F1D40052152A /* TranslationAnalysis.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0719414B9F1D40052152A /* TranslationAnalysis.cpp */; };
-/* End PBXBuildFile section */
-
-/* Begin PBXContainerItemProxy section */
- 1EAF9DAC14B9F8AD005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */;
- proxyType = 2;
- remoteGlobalIDString = D2AAC046055464E500DB518D;
- remoteInfo = moses;
- };
- 1EAF9DB514B9F8B1005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */;
- proxyType = 2;
- remoteGlobalIDString = D2AAC046055464E500DB518D;
- remoteInfo = OnDiskPt;
- };
- 1EAF9DC214B9F8BA005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */;
- proxyType = 2;
- remoteGlobalIDString = 1EE8C2E91476A48E002496F2;
- remoteInfo = lm;
- };
- 1EAF9DCB14B9F8D6005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */;
- proxyType = 1;
- remoteGlobalIDString = D2AAC045055464E500DB518D;
- remoteInfo = OnDiskPt;
- };
- 1EAF9DCD14B9F8D6005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */;
- proxyType = 1;
- remoteGlobalIDString = D2AAC045055464E500DB518D;
- remoteInfo = moses;
- };
- 1EAF9DCF14B9F8D6005E8EBD /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */;
- proxyType = 1;
- remoteGlobalIDString = 1EE8C2E81476A48E002496F2;
- remoteInfo = lm;
- };
- 1EBC53BC164C4AC300ADFA2C /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */;
- proxyType = 2;
- remoteGlobalIDString = 1EBC53AE164C4A6200ADFA2C;
- remoteInfo = search;
- };
- 1EBC53E5164C4AFC00ADFA2C /* PBXContainerItemProxy */ = {
- isa = PBXContainerItemProxy;
- containerPortal = 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */;
- proxyType = 1;
- remoteGlobalIDString = 1EBC53AD164C4A6200ADFA2C;
- remoteInfo = search;
- };
-/* End PBXContainerItemProxy section */
-
-/* Begin PBXCopyFilesBuildPhase section */
- 8DD76F690486A84900D96B5E /* CopyFiles */ = {
- isa = PBXCopyFilesBuildPhase;
- buildActionMask = 8;
- dstPath = /usr/share/man/man1/;
- dstSubfolderSpec = 0;
- files = (
- );
- runOnlyForDeploymentPostprocessing = 1;
- };
-/* End PBXCopyFilesBuildPhase section */
-
-/* Begin PBXFileReference section */
- 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = moses.xcodeproj; sourceTree = "<group>"; };
- 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = OnDiskPt.xcodeproj; sourceTree = "<group>"; };
- 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = lm.xcodeproj; sourceTree = "<group>"; };
- 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = search.xcodeproj; sourceTree = "<group>"; };
- 1EF0718A14B9F1D40052152A /* IOWrapper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = IOWrapper.cpp; path = "../../moses-chart-cmd/src/IOWrapper.cpp"; sourceTree = "<group>"; };
- 1EF0718B14B9F1D40052152A /* IOWrapper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = IOWrapper.h; path = "../../moses-chart-cmd/src/IOWrapper.h"; sourceTree = "<group>"; };
- 1EF0718E14B9F1D40052152A /* Main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Main.cpp; path = "../../moses-chart-cmd/src/Main.cpp"; sourceTree = "<group>"; };
- 1EF0718F14B9F1D40052152A /* Main.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Main.h; path = "../../moses-chart-cmd/src/Main.h"; sourceTree = "<group>"; };
- 1EF0719114B9F1D40052152A /* mbr.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = mbr.cpp; path = "../../moses-chart-cmd/src/mbr.cpp"; sourceTree = "<group>"; };
- 1EF0719214B9F1D40052152A /* mbr.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = mbr.h; path = "../../moses-chart-cmd/src/mbr.h"; sourceTree = "<group>"; };
- 1EF0719414B9F1D40052152A /* TranslationAnalysis.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = TranslationAnalysis.cpp; path = "../../moses-chart-cmd/src/TranslationAnalysis.cpp"; sourceTree = "<group>"; };
- 1EF0719514B9F1D40052152A /* TranslationAnalysis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = TranslationAnalysis.h; path = "../../moses-chart-cmd/src/TranslationAnalysis.h"; sourceTree = "<group>"; };
- 8DD76F6C0486A84900D96B5E /* moses-chart-cmd */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "moses-chart-cmd"; sourceTree = BUILT_PRODUCTS_DIR; };
-/* End PBXFileReference section */
-
-/* Begin PBXFrameworksBuildPhase section */
- 8DD76F660486A84900D96B5E /* Frameworks */ = {
- isa = PBXFrameworksBuildPhase;
- buildActionMask = 2147483647;
- files = (
- 1EBC53E7164C4B1400ADFA2C /* libsearch.a in Frameworks */,
- 1EAF9DC614B9F8CD005E8EBD /* liblm.a in Frameworks */,
- 1EAF9DC714B9F8CD005E8EBD /* libmoses.a in Frameworks */,
- 1EAF9DC814B9F8CD005E8EBD /* libOnDiskPt.a in Frameworks */,
- );
- runOnlyForDeploymentPostprocessing = 0;
- };
-/* End PBXFrameworksBuildPhase section */
-
-/* Begin PBXGroup section */
- 08FB7794FE84155DC02AAC07 /* moses-chart-cmd */ = {
- isa = PBXGroup;
- children = (
- 08FB7795FE84155DC02AAC07 /* Source */,
- C6859E8C029090F304C91782 /* Documentation */,
- 1AB674ADFE9D54B511CA2CBB /* Products */,
- 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */,
- 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */,
- 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */,
- 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */,
- );
- name = "moses-chart-cmd";
- sourceTree = "<group>";
- };
- 08FB7795FE84155DC02AAC07 /* Source */ = {
- isa = PBXGroup;
- children = (
- 1EF0718A14B9F1D40052152A /* IOWrapper.cpp */,
- 1EF0718B14B9F1D40052152A /* IOWrapper.h */,
- 1EF0718E14B9F1D40052152A /* Main.cpp */,
- 1EF0718F14B9F1D40052152A /* Main.h */,
- 1EF0719114B9F1D40052152A /* mbr.cpp */,
- 1EF0719214B9F1D40052152A /* mbr.h */,
- 1EF0719414B9F1D40052152A /* TranslationAnalysis.cpp */,
- 1EF0719514B9F1D40052152A /* TranslationAnalysis.h */,
- );
- name = Source;
- sourceTree = "<group>";
- };
- 1AB674ADFE9D54B511CA2CBB /* Products */ = {
- isa = PBXGroup;
- children = (
- 8DD76F6C0486A84900D96B5E /* moses-chart-cmd */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- 1EAF9DA614B9F8AD005E8EBD /* Products */ = {
- isa = PBXGroup;
- children = (
- 1EAF9DAD14B9F8AD005E8EBD /* libmoses.a */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- 1EAF9DAF14B9F8B1005E8EBD /* Products */ = {
- isa = PBXGroup;
- children = (
- 1EAF9DB614B9F8B1005E8EBD /* libOnDiskPt.a */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- 1EAF9DB814B9F8B9005E8EBD /* Products */ = {
- isa = PBXGroup;
- children = (
- 1EAF9DC314B9F8BA005E8EBD /* liblm.a */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- 1EBC53B6164C4AC300ADFA2C /* Products */ = {
- isa = PBXGroup;
- children = (
- 1EBC53BD164C4AC300ADFA2C /* libsearch.a */,
- );
- name = Products;
- sourceTree = "<group>";
- };
- C6859E8C029090F304C91782 /* Documentation */ = {
- isa = PBXGroup;
- children = (
- );
- name = Documentation;
- sourceTree = "<group>";
- };
-/* End PBXGroup section */
-
-/* Begin PBXNativeTarget section */
- 8DD76F620486A84900D96B5E /* moses-chart-cmd */ = {
- isa = PBXNativeTarget;
- buildConfigurationList = 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "moses-chart-cmd" */;
- buildPhases = (
- 8DD76F640486A84900D96B5E /* Sources */,
- 8DD76F660486A84900D96B5E /* Frameworks */,
- 8DD76F690486A84900D96B5E /* CopyFiles */,
- );
- buildRules = (
- );
- dependencies = (
- 1EBC53E6164C4AFC00ADFA2C /* PBXTargetDependency */,
- 1EAF9DCC14B9F8D6005E8EBD /* PBXTargetDependency */,
- 1EAF9DCE14B9F8D6005E8EBD /* PBXTargetDependency */,
- 1EAF9DD014B9F8D6005E8EBD /* PBXTargetDependency */,
- );
- name = "moses-chart-cmd";
- productInstallPath = "$(HOME)/bin";
- productName = "moses-chart-cmd";
- productReference = 8DD76F6C0486A84900D96B5E /* moses-chart-cmd */;
- productType = "com.apple.product-type.tool";
- };
-/* End PBXNativeTarget section */
-
-/* Begin PBXProject section */
- 08FB7793FE84155DC02AAC07 /* Project object */ = {
- isa = PBXProject;
- buildConfigurationList = 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "moses-chart-cmd" */;
- compatibilityVersion = "Xcode 3.1";
- developmentRegion = English;
- hasScannedForEncodings = 1;
- knownRegions = (
- English,
- Japanese,
- French,
- German,
- );
- mainGroup = 08FB7794FE84155DC02AAC07 /* moses-chart-cmd */;
- projectDirPath = "";
- projectReferences = (
- {
- ProductGroup = 1EAF9DB814B9F8B9005E8EBD /* Products */;
- ProjectRef = 1EAF9DB714B9F8B9005E8EBD /* lm.xcodeproj */;
- },
- {
- ProductGroup = 1EAF9DA614B9F8AD005E8EBD /* Products */;
- ProjectRef = 1EAF9DA514B9F8AD005E8EBD /* moses.xcodeproj */;
- },
- {
- ProductGroup = 1EAF9DAF14B9F8B1005E8EBD /* Products */;
- ProjectRef = 1EAF9DAE14B9F8B1005E8EBD /* OnDiskPt.xcodeproj */;
- },
- {
- ProductGroup = 1EBC53B6164C4AC300ADFA2C /* Products */;
- ProjectRef = 1EBC53B5164C4AC300ADFA2C /* search.xcodeproj */;
- },
- );
- projectRoot = "";
- targets = (
- 8DD76F620486A84900D96B5E /* moses-chart-cmd */,
- );
- };
-/* End PBXProject section */
-
-/* Begin PBXReferenceProxy section */
- 1EAF9DAD14B9F8AD005E8EBD /* libmoses.a */ = {
- isa = PBXReferenceProxy;
- fileType = archive.ar;
- path = libmoses.a;
- remoteRef = 1EAF9DAC14B9F8AD005E8EBD /* PBXContainerItemProxy */;
- sourceTree = BUILT_PRODUCTS_DIR;
- };
- 1EAF9DB614B9F8B1005E8EBD /* libOnDiskPt.a */ = {
- isa = PBXReferenceProxy;
- fileType = archive.ar;
- path = libOnDiskPt.a;
- remoteRef = 1EAF9DB514B9F8B1005E8EBD /* PBXContainerItemProxy */;
- sourceTree = BUILT_PRODUCTS_DIR;
- };
- 1EAF9DC314B9F8BA005E8EBD /* liblm.a */ = {
- isa = PBXReferenceProxy;
- fileType = archive.ar;
- path = liblm.a;
- remoteRef = 1EAF9DC214B9F8BA005E8EBD /* PBXContainerItemProxy */;
- sourceTree = BUILT_PRODUCTS_DIR;
- };
- 1EBC53BD164C4AC300ADFA2C /* libsearch.a */ = {
- isa = PBXReferenceProxy;
- fileType = archive.ar;
- path = libsearch.a;
- remoteRef = 1EBC53BC164C4AC300ADFA2C /* PBXContainerItemProxy */;
- sourceTree = BUILT_PRODUCTS_DIR;
- };
-/* End PBXReferenceProxy section */
-
-/* Begin PBXSourcesBuildPhase section */
- 8DD76F640486A84900D96B5E /* Sources */ = {
- isa = PBXSourcesBuildPhase;
- buildActionMask = 2147483647;
- files = (
- 1EF0719F14B9F1D40052152A /* IOWrapper.cpp in Sources */,
- 1EF071A214B9F1D40052152A /* Main.cpp in Sources */,
- 1EF071A414B9F1D40052152A /* mbr.cpp in Sources */,
- 1EF071A614B9F1D40052152A /* TranslationAnalysis.cpp in Sources */,
- );
- runOnlyForDeploymentPostprocessing = 0;
- };
-/* End PBXSourcesBuildPhase section */
-
-/* Begin PBXTargetDependency section */
- 1EAF9DCC14B9F8D6005E8EBD /* PBXTargetDependency */ = {
- isa = PBXTargetDependency;
- name = OnDiskPt;
- targetProxy = 1EAF9DCB14B9F8D6005E8EBD /* PBXContainerItemProxy */;
- };
- 1EAF9DCE14B9F8D6005E8EBD /* PBXTargetDependency */ = {
- isa = PBXTargetDependency;
- name = moses;
- targetProxy = 1EAF9DCD14B9F8D6005E8EBD /* PBXContainerItemProxy */;
- };
- 1EAF9DD014B9F8D6005E8EBD /* PBXTargetDependency */ = {
- isa = PBXTargetDependency;
- name = lm;
- targetProxy = 1EAF9DCF14B9F8D6005E8EBD /* PBXContainerItemProxy */;
- };
- 1EBC53E6164C4AFC00ADFA2C /* PBXTargetDependency */ = {
- isa = PBXTargetDependency;
- name = search;
- targetProxy = 1EBC53E5164C4AFC00ADFA2C /* PBXContainerItemProxy */;
- };
-/* End PBXTargetDependency section */
-
-/* Begin XCBuildConfiguration section */
- 1DEB923208733DC60010E9CD /* Debug */ = {
- isa = XCBuildConfiguration;
- buildSettings = {
- ALWAYS_SEARCH_USER_PATHS = NO;
- ARCHS = "$(ARCHS_STANDARD_64_BIT)";
- COPY_PHASE_STRIP = NO;
- GCC_DYNAMIC_NO_PIC = NO;
- GCC_ENABLE_FIX_AND_CONTINUE = YES;
- GCC_MODEL_TUNING = G5;
- GCC_OPTIMIZATION_LEVEL = 0;
- GCC_PREPROCESSOR_DEFINITIONS = (
- TRACE_ENABLE,
- _LARGE_FILES,
- "_FILE_OFFSET_BITS=64",
- "MAX_NUM_FACTORS=4",
- );
- HEADER_SEARCH_PATHS = /opt/local/include;
- INSTALL_PATH = /usr/local/bin;
- LIBRARY_SEARCH_PATHS = (
- ../../irstlm/lib,
- ../../srilm/lib/macosx,
- /opt/local/lib,
- ../../cmph/lib,
- );
- OTHER_LDFLAGS = (
- "-lz",
- "-lirstlm",
- "-lmisc",
- "-ldstruct",
- "-loolm",
- "-lflm",
- "-llattice",
- "-lboost_thread-mt",
- "-lboost_filesystem-mt",
- "-lboost_system-mt",
- "-lcmph",
- );
- PRODUCT_NAME = "moses-chart-cmd";
- SDKROOT = "";
- USER_HEADER_SEARCH_PATHS = "../../ ../../moses/src";
- };
- name = Debug;
- };
- 1DEB923308733DC60010E9CD /* Release */ = {
- isa = XCBuildConfiguration;
- buildSettings = {
- ALWAYS_SEARCH_USER_PATHS = NO;
- ARCHS = "$(ARCHS_STANDARD_64_BIT)";
- DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
- GCC_MODEL_TUNING = G5;
- GCC_PREPROCESSOR_DEFINITIONS = (
- TRACE_ENABLE,
- _LARGE_FILES,
- "_FILE_OFFSET_BITS=64",
- "MAX_NUM_FACTORS=4",
- );
- HEADER_SEARCH_PATHS = /opt/local/include;
- INSTALL_PATH = /usr/local/bin;
- LIBRARY_SEARCH_PATHS = (
- ../../irstlm/lib,
- ../../srilm/lib/macosx,
- /opt/local/lib,
- ../../cmph/lib,
- );
- OTHER_LDFLAGS = (
- "-lz",
- "-lirstlm",
- "-lmisc",
- "-ldstruct",
- "-loolm",
- "-lflm",
- "-llattice",
- "-lboost_thread-mt",
- "-lboost_filesystem-mt",
- "-lboost_system-mt",
- "-lcmph",
- );
- PRODUCT_NAME = "moses-chart-cmd";
- SDKROOT = "";
- USER_HEADER_SEARCH_PATHS = "../../ ../../moses/src";
- };
- name = Release;
- };
- 1DEB923608733DC60010E9CD /* Debug */ = {
- isa = XCBuildConfiguration;
- buildSettings = {
- ARCHS = "$(ARCHS_STANDARD_64_BIT)";
- GCC_C_LANGUAGE_STANDARD = gnu99;
- GCC_OPTIMIZATION_LEVEL = 0;
- GCC_PREPROCESSOR_DEFINITIONS = (
- TRACE_ENABLE,
- WITH_THREADS,
- );
- GCC_WARN_ABOUT_RETURN_TYPE = YES;
- GCC_WARN_UNUSED_VARIABLE = YES;
- HEADER_SEARCH_PATHS = (
- ../../moses/src,
- ../..,
- "/Users/hieuhoang/workspace/github/moses-smt/moses/src/**",
- );
- ONLY_ACTIVE_ARCH = YES;
- PREBINDING = NO;
- SDKROOT = "";
- };
- name = Debug;
- };
- 1DEB923708733DC60010E9CD /* Release */ = {
- isa = XCBuildConfiguration;
- buildSettings = {
- ARCHS = "$(ARCHS_STANDARD_64_BIT)";
- GCC_C_LANGUAGE_STANDARD = gnu99;
- GCC_PREPROCESSOR_DEFINITIONS = (
- TRACE_ENABLE,
- WITH_THREADS,
- );
- GCC_WARN_ABOUT_RETURN_TYPE = YES;
- GCC_WARN_UNUSED_VARIABLE = YES;
- HEADER_SEARCH_PATHS = (
- ../../moses/src,
- ../..,
- "/Users/hieuhoang/workspace/github/moses-smt/moses/src/**",
- );
- ONLY_ACTIVE_ARCH = YES;
- PREBINDING = NO;
- SDKROOT = "";
- };
- name = Release;
- };
-/* End XCBuildConfiguration section */
-
-/* Begin XCConfigurationList section */
- 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "moses-chart-cmd" */ = {
- isa = XCConfigurationList;
- buildConfigurations = (
- 1DEB923208733DC60010E9CD /* Debug */,
- 1DEB923308733DC60010E9CD /* Release */,
- );
- defaultConfigurationIsVisible = 0;
- defaultConfigurationName = Release;
- };
- 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "moses-chart-cmd" */ = {
- isa = XCConfigurationList;
- buildConfigurations = (
- 1DEB923608733DC60010E9CD /* Debug */,
- 1DEB923708733DC60010E9CD /* Release */,
- );
- defaultConfigurationIsVisible = 0;
- defaultConfigurationName = Release;
- };
-/* End XCConfigurationList section */
- };
- rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
-}
diff --git a/contrib/other-builds/moses-chart-cmd/.project b/contrib/other-builds/moses-chart-cmd/.project
deleted file mode 100644
index 5022e23ff..000000000
--- a/contrib/other-builds/moses-chart-cmd/.project
+++ /dev/null
@@ -1,135 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
- <name>moses-chart-cmd</name>
- <comment></comment>
- <projects>
- <project>lm</project>
- <project>moses</project>
- <project>OnDiskPt</project>
- <project>search</project>
- <project>util</project>
- </projects>
- <buildSpec>
- <buildCommand>
- <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
- <triggers>clean,full,incremental,</triggers>
- <arguments>
- <dictionary>
- <key>?name?</key>
- <value></value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.append_environment</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.autoBuildTarget</key>
- <value>all</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildArguments</key>
- <value>-j3</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildCommand</key>
- <value>make</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildLocation</key>
- <value>${workspace_loc:/moses-chart-cmd/Debug}</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
- <value>clean</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.contents</key>
- <value>org.eclipse.cdt.make.core.activeConfigSettings</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableAutoBuild</key>
- <value>false</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableCleanBuild</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableFullBuild</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.fullBuildTarget</key>
- <value>all</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.stopOnError</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
- <value>true</value>
- </dictionary>
- </arguments>
- </buildCommand>
- <buildCommand>
- <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
- <triggers>full,incremental,</triggers>
- <arguments>
- </arguments>
- </buildCommand>
- </buildSpec>
- <natures>
- <nature>org.eclipse.cdt.core.cnature</nature>
- <nature>org.eclipse.cdt.core.ccnature</nature>
- <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
- <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
- </natures>
- <linkedResources>
- <link>
- <name>IOWrapper.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/IOWrapper.cpp</locationURI>
- </link>
- <link>
- <name>IOWrapper.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/IOWrapper.h</locationURI>
- </link>
- <link>
- <name>Jamfile</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/Jamfile</locationURI>
- </link>
- <link>
- <name>Main.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/Main.cpp</locationURI>
- </link>
- <link>
- <name>Main.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/Main.h</locationURI>
- </link>
- <link>
- <name>TranslationAnalysis.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/TranslationAnalysis.cpp</locationURI>
- </link>
- <link>
- <name>TranslationAnalysis.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/TranslationAnalysis.h</locationURI>
- </link>
- <link>
- <name>mbr.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/mbr.cpp</locationURI>
- </link>
- <link>
- <name>mbr.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-chart-cmd/mbr.h</locationURI>
- </link>
- </linkedResources>
-</projectDescription>
diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject
index 828b71395..6ed3d4818 100644
--- a/contrib/other-builds/moses-cmd/.cproject
+++ b/contrib/other-builds/moses-cmd/.cproject
@@ -5,13 +5,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.461114338" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -46,15 +46,7 @@
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.2096997198" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../DALM/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../nplm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../randlm/lib&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../cmph/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
@@ -64,22 +56,15 @@
</option>
<option id="gnu.cpp.link.option.libs.998577284" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="moses"/>
- <listOptionValue builtIn="false" value="irstlm"/>
- <listOptionValue builtIn="false" value="dstruct"/>
- <listOptionValue builtIn="false" value="flm"/>
- <listOptionValue builtIn="false" value="oolm"/>
- <listOptionValue builtIn="false" value="lattice"/>
- <listOptionValue builtIn="false" value="misc"/>
- <listOptionValue builtIn="false" value="dalm"/>
<listOptionValue builtIn="false" value="search"/>
- <listOptionValue builtIn="false" value="RandLM"/>
<listOptionValue builtIn="false" value="OnDiskPt"/>
<listOptionValue builtIn="false" value="lm"/>
<listOptionValue builtIn="false" value="util"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
- <listOptionValue builtIn="false" value="boost_system-mt"/>
- <listOptionValue builtIn="false" value="boost_thread-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="boost_serialization"/>
+ <listOptionValue builtIn="false" value="boost_system"/>
+ <listOptionValue builtIn="false" value="boost_thread"/>
+ <listOptionValue builtIn="false" value="boost_filesystem"/>
<listOptionValue builtIn="false" value="pthread"/>
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="bz2"/>
@@ -108,13 +93,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.2121690436" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -179,4 +164,5 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+ <storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>
diff --git a/contrib/other-builds/moses-cmd/.project b/contrib/other-builds/moses-cmd/.project
index 541d60b6f..312c61654 100644
--- a/contrib/other-builds/moses-cmd/.project
+++ b/contrib/other-builds/moses-cmd/.project
@@ -87,31 +87,11 @@
</natures>
<linkedResources>
<link>
- <name>IOWrapper.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/IOWrapper.cpp</locationURI>
- </link>
- <link>
- <name>IOWrapper.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/IOWrapper.h</locationURI>
- </link>
- <link>
<name>Jamfile</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses-cmd/Jamfile</locationURI>
</link>
<link>
- <name>LatticeMBR.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/LatticeMBR.cpp</locationURI>
- </link>
- <link>
- <name>LatticeMBR.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/LatticeMBR.h</locationURI>
- </link>
- <link>
<name>LatticeMBRGrid.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses-cmd/LatticeMBRGrid.cpp</locationURI>
@@ -126,25 +106,5 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses-cmd/Main.h</locationURI>
</link>
- <link>
- <name>TranslationAnalysis.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/TranslationAnalysis.cpp</locationURI>
- </link>
- <link>
- <name>TranslationAnalysis.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/TranslationAnalysis.h</locationURI>
- </link>
- <link>
- <name>mbr.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/mbr.cpp</locationURI>
- </link>
- <link>
- <name>mbr.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses-cmd/mbr.h</locationURI>
- </link>
</linkedResources>
</projectDescription>
diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject
index 862a1deb1..2fd2601c6 100644
--- a/contrib/other-builds/moses/.cproject
+++ b/contrib/other-builds/moses/.cproject
@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.1846963597">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1846963597" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings>
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/>
@@ -16,76 +16,67 @@
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactExtension="a" artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.staticLib" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.staticLib" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.656913512" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
- <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
- <tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
- <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.2123672332" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.57896781" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <configuration artifactExtension="a" artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.staticLib" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.staticLib" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.1846963597" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1167373278" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.397694981" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/moses}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1778877633" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1097285966" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1729217620" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1455257477" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.227767392" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.876218169" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../cmph/include&quot;"/>
- <listOptionValue builtIn="false" value="/opt/local/include/"/>
- <listOptionValue builtIn="false" value="${workspace_loc}/../../irstlm/include"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../nplm/src&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../eigen&quot;"/>
- <listOptionValue builtIn="false" value="${workspace_loc}/../../srilm/include"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../DALM/include&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../DALM/darts-clone&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../randlm/include/RandLM&quot;"/>
- <listOptionValue builtIn="false" value="${workspace_loc}/../../"/>
</option>
- <option id="gnu.cpp.compiler.option.preprocessor.def.752586397" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
- <listOptionValue builtIn="false" value="IS_ECLIPSE"/>
- <listOptionValue builtIn="false" value="HAVE_BOOST"/>
+ <option id="gnu.cpp.compiler.option.preprocessor.def.53427549" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="MAX_NUM_FACTORS=4"/>
- <listOptionValue builtIn="false" value="WITH_THREADS"/>
<listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/>
- <listOptionValue builtIn="false" value="TRACE_ENABLE"/>
- <listOptionValue builtIn="false" value="LM_DALM"/>
- <listOptionValue builtIn="false" value="LM_IRST"/>
- <listOptionValue builtIn="false" value="LM_RAND"/>
- <listOptionValue builtIn="false" value="LM_NPLM"/>
- <listOptionValue builtIn="false" value="_FILE_OFFSET_BIT=64"/>
+ <listOptionValue builtIn="false" value="WITH_THREADS"/>
+ <listOptionValue builtIn="false" value="_FILE_OFFSET_BITS=64"/>
<listOptionValue builtIn="false" value="_LARGE_FILES"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1905116220" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1023855536" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.2126314903" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.1524900118" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.exe.debug.option.debugging.level.581728958" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.877210753" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1313249282" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.146557271" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.1656486500" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.570559630" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1168585173" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.2074660557" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.340054018" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1471271407" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1144959654" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.323925091" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.933467113" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.99047750" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1362368838" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.383587863" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
- <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.511477442" name="Rand.h" rcbsApplicability="disable" resourcePath="LM/Rand.h" toolsToInvoke=""/>
- <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.790052015" name="IRST.h" rcbsApplicability="disable" resourcePath="LM/IRST.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1113398114" name="Rand.h" rcbsApplicability="disable" resourcePath="LM/Rand.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1183410636" name="ORLM.h" rcbsApplicability="disable" resourcePath="LM/ORLM.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1448475064" name="IRST.h" rcbsApplicability="disable" resourcePath="LM/IRST.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1459438132" name="DALMWrapper.h" rcbsApplicability="disable" resourcePath="LM/DALMWrapper.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1094892289" name="MaxEntSRI.h" rcbsApplicability="disable" resourcePath="LM/MaxEntSRI.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1720439764" name="NeuralLMWrapper.h" rcbsApplicability="disable" resourcePath="LM/NeuralLMWrapper.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1272004353" name="BilingualLM.h" rcbsApplicability="disable" resourcePath="LM/BilingualLM.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1815042864" name="SRI.h" rcbsApplicability="disable" resourcePath="LM/SRI.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.871386239" name="LDHT.h" rcbsApplicability="disable" resourcePath="LM/LDHT.h" toolsToInvoke=""/>
+ <fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1761300858" name="ParallelBackoff.h" rcbsApplicability="disable" resourcePath="LM/ParallelBackoff.h" toolsToInvoke=""/>
<sourceEntries>
- <entry excluding="TranslationModel/CompactPT|LM/NeuralLMWrapper.cpp|FF/PhraseLengthFeatureTest.cpp|PhraseLengthFeatureTest.cpp|LM/BackwardTest.cpp|LM/BackwardLMState.h|LM/BackwardLMState.cpp|LM/Backward.h|LM/Backward.cpp|FeatureVectorTest.cpp|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|src/SyntacticLanguageModelState.h|src/SyntacticLanguageModelFiles.h|src/SyntacticLanguageModel.h|src/SyntacticLanguageModel.cpp|src/LM/SRI.h|src/LM/SRI.cpp|src/LM/Rand.h|src/LM/Rand.cpp|src/LM/LDHT.h|src/LM/LDHT.cpp|SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/LDHT.h|LM/LDHT.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+ <entry excluding="LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|LM/bilingual-lm|LM/MaxEntSRI.h|LM/MaxEntSRI.cpp|LM/BilingualLM.h|LM/BilingualLM.cpp|TranslationModel/CompactPT|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp|LM/ORLM.h|LM/ORLM.cpp|LM/NeuralLMWrapper.h|LM/NeuralLMWrapper.cpp|LM/SRI.h|LM/SRI.cpp|LM/IRST.h|LM/IRST.cpp|LM/DALMWrapper.h|LM/DALMWrapper.cpp|LM/oxlm|TranslationModel/ProbingPT|TranslationModel/UG|TranslationModel/UG/util" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.401150096">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.401150096" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.1911984684">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1911984684" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
@@ -94,35 +85,34 @@
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
- <extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.401150096" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.exe.release.401150096." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.36295137" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
- <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.release.538725710" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
- <builder buildPath="${workspace_loc:/moses/Release}" id="cdt.managedbuild.target.gnu.builder.exe.release.1875953334" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
- <tool id="cdt.managedbuild.tool.gnu.archiver.base.1633496039" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.2060881562" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
- <option id="gnu.cpp.compiler.exe.release.option.optimization.level.1375372870" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.exe.release.option.debugging.level.815283803" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1020483420" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1911984684" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.1911984684." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1552241309" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.332871558" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/moses}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.391025866" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1623685179" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1914197251" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.2144875045" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.9472765" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1143887599" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.85324871" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1137534635" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.exe.release.option.debugging.level.143589037" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.304912704" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1469504539" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1950806117" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.1109082339" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2103068478" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.283583965" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
- <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.2059280959" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2020956494" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.105686784" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.888859695" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1025399565" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.782286837" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1766138143" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.136178961" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1636213141" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -132,35 +122,24 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="moses.cdt.managedbuild.target.gnu.exe.1375079569" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
+ <project id="moses.cdt.managedbuild.target.gnu.exe.129952716" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426;cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.;cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.401409202;cdt.managedbuild.tool.gnu.c.compiler.input.1919272901">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.macosx.exe.release.722580523;cdt.managedbuild.config.macosx.exe.release.722580523.;cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.release.1404156839;cdt.managedbuild.tool.gnu.c.compiler.input.1172147378">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.401150096;cdt.managedbuild.config.gnu.exe.release.401150096.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.85324871;cdt.managedbuild.tool.gnu.c.compiler.input.304912704">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1846963597;cdt.managedbuild.config.gnu.exe.debug.1846963597.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1313249282;cdt.managedbuild.tool.gnu.c.compiler.input.570559630">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.656913512;cdt.managedbuild.config.gnu.exe.debug.656913512.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327;cdt.managedbuild.tool.gnu.cpp.compiler.input.1905116220">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1911984684;cdt.managedbuild.config.gnu.exe.release.1911984684.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1914197251;cdt.managedbuild.tool.gnu.cpp.compiler.input.1143887599">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426;cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.;cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1867588805;cdt.managedbuild.tool.gnu.cpp.compiler.input.1110302565">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1846963597;cdt.managedbuild.config.gnu.exe.debug.1846963597.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1729217620;cdt.managedbuild.tool.gnu.cpp.compiler.input.1023855536">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.macosx.exe.release.722580523;cdt.managedbuild.config.macosx.exe.release.722580523.;cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.release.1662892925;cdt.managedbuild.tool.gnu.cpp.compiler.input.936283391">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.656913512;cdt.managedbuild.config.gnu.exe.debug.656913512.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.2126314903;cdt.managedbuild.tool.gnu.c.compiler.input.877210753">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
- </scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.401150096;cdt.managedbuild.config.gnu.exe.release.401150096.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.2060881562;cdt.managedbuild.tool.gnu.cpp.compiler.input.1020483420">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1911984684;cdt.managedbuild.config.gnu.exe.release.1911984684.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1469504539;cdt.managedbuild.tool.gnu.c.compiler.input.2103068478">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses"/>
@@ -170,5 +149,4 @@
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
- <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project
index f14f534d1..d26a65d86 100644
--- a/contrib/other-builds/moses/.project
+++ b/contrib/other-builds/moses/.project
@@ -9,62 +9,6 @@
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
- <dictionary>
- <key>?name?</key>
- <value></value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.append_environment</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.autoBuildTarget</key>
- <value>all</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildArguments</key>
- <value>-j3</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildCommand</key>
- <value>make</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.buildLocation</key>
- <value>${workspace_loc:/moses/Debug}</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
- <value>clean</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.contents</key>
- <value>org.eclipse.cdt.make.core.activeConfigSettings</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableAutoBuild</key>
- <value>false</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableCleanBuild</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.enableFullBuild</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.fullBuildTarget</key>
- <value>all</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.stopOnError</key>
- <value>true</value>
- </dictionary>
- <dictionary>
- <key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
- <value>true</value>
- </dictionary>
</arguments>
</buildCommand>
<buildCommand>
@@ -82,6 +26,16 @@
</natures>
<linkedResources>
<link>
+ <name>Alignment.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Alignment.cpp</locationURI>
+ </link>
+ <link>
+ <name>Alignment.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/Alignment.h</locationURI>
+ </link>
+ <link>
<name>AlignmentInfo.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/AlignmentInfo.cpp</locationURI>
@@ -107,6 +61,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/AlignmentInfoTest.cpp</locationURI>
</link>
<link>
+ <name>BaseManager.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/BaseManager.cpp</locationURI>
+ </link>
+ <link>
+ <name>BaseManager.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/BaseManager.h</locationURI>
+ </link>
+ <link>
<name>BitmapContainer.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/BitmapContainer.cpp</locationURI>
@@ -117,6 +81,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/BitmapContainer.h</locationURI>
</link>
<link>
+ <name>CMakeLists.txt</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/CMakeLists.txt</locationURI>
+ </link>
+ <link>
<name>ChartCell.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ChartCell.cpp</locationURI>
@@ -377,6 +346,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/GenerationDictionary.h</locationURI>
</link>
<link>
+ <name>HypergraphOutput.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/HypergraphOutput.cpp</locationURI>
+ </link>
+ <link>
+ <name>HypergraphOutput.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/HypergraphOutput.h</locationURI>
+ </link>
+ <link>
<name>HypoList.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/HypoList.h</locationURI>
@@ -422,6 +401,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/HypothesisStackNormal.h</locationURI>
</link>
<link>
+ <name>IOWrapper.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/IOWrapper.cpp</locationURI>
+ </link>
+ <link>
+ <name>IOWrapper.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/IOWrapper.h</locationURI>
+ </link>
+ <link>
<name>Incremental.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Incremental.cpp</locationURI>
@@ -482,6 +471,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/LVoc.h</locationURI>
</link>
<link>
+ <name>LatticeMBR.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LatticeMBR.cpp</locationURI>
+ </link>
+ <link>
+ <name>LatticeMBR.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LatticeMBR.h</locationURI>
+ </link>
+ <link>
<name>Manager.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Manager.cpp</locationURI>
@@ -537,6 +536,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/PCNTools.h</locationURI>
</link>
<link>
+ <name>PDTAimp.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PDTAimp.cpp</locationURI>
+ </link>
+ <link>
<name>PDTAimp.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PDTAimp.h</locationURI>
@@ -602,16 +606,6 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/ReorderingConstraint.h</locationURI>
</link>
<link>
- <name>ReorderingStack.cpp</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/ReorderingStack.cpp</locationURI>
- </link>
- <link>
- <name>ReorderingStack.h</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/ReorderingStack.h</locationURI>
- </link>
- <link>
<name>RuleCube.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/RuleCube.cpp</locationURI>
@@ -762,6 +756,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/SyntacticLanguageModelState.h</locationURI>
</link>
<link>
+ <name>Syntax</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>TargetPhrase.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TargetPhrase.cpp</locationURI>
@@ -807,6 +806,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/Timer.h</locationURI>
</link>
<link>
+ <name>TranslationAnalysis.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationAnalysis.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationAnalysis.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationAnalysis.h</locationURI>
+ </link>
+ <link>
<name>TranslationModel</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
@@ -872,6 +881,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationOptionList.h</locationURI>
</link>
<link>
+ <name>TranslationTask.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationTask.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationTask.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationTask.h</locationURI>
+ </link>
+ <link>
<name>TreeInput.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TreeInput.cpp</locationURI>
@@ -992,6 +1011,11 @@
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
+ <name>extract-ghkm</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>gzfilebuf.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/gzfilebuf.h</locationURI>
@@ -1002,6 +1026,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/hypergraph.proto</locationURI>
</link>
<link>
+ <name>mbr.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/mbr.cpp</locationURI>
+ </link>
+ <link>
+ <name>mbr.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/mbr.h</locationURI>
+ </link>
+ <link>
<name>rule.proto</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/rule.proto</locationURI>
@@ -1157,14 +1191,14 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/InputFeature.h</locationURI>
</link>
<link>
- <name>FF/InternalStructStatelessFF.cpp</name>
+ <name>FF/InternalTree.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/FF/InternalStructStatelessFF.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/InternalTree.cpp</locationURI>
</link>
<link>
- <name>FF/InternalStructStatelessFF.h</name>
+ <name>FF/InternalTree.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/FF/InternalStructStatelessFF.h</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/InternalTree.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering</name>
@@ -1217,9 +1251,14 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseLengthFeature.h</locationURI>
</link>
<link>
- <name>FF/PhraseLengthFeatureTest.cpp</name>
+ <name>FF/PhraseOrientationFeature.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseLengthFeatureTest.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseOrientationFeature.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/PhraseOrientationFeature.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseOrientationFeature.h</locationURI>
</link>
<link>
<name>FF/PhrasePairFeature.cpp</name>
@@ -1272,6 +1311,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SetSourcePhrase.h</locationURI>
</link>
<link>
+ <name>FF/SkeletonChangeInput.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SkeletonChangeInput.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/SkeletonChangeInput.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SkeletonChangeInput.h</locationURI>
+ </link>
+ <link>
<name>FF/SkeletonStatefulFF.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SkeletonStatefulFF.cpp</locationURI>
@@ -1302,6 +1351,26 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftMatchingFeature.h</locationURI>
</link>
<link>
+ <name>FF/SoftSourceSyntacticConstraintsFeature.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/SoftSourceSyntacticConstraintsFeature.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftSourceSyntacticConstraintsFeature.h</locationURI>
+ </link>
+ <link>
+ <name>FF/SourceGHKMTreeInputMatchFeature.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SourceGHKMTreeInputMatchFeature.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/SourceGHKMTreeInputMatchFeature.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SourceGHKMTreeInputMatchFeature.h</locationURI>
+ </link>
+ <link>
<name>FF/SourceWordDeletionFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SourceWordDeletionFeature.cpp</locationURI>
@@ -1312,6 +1381,31 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SourceWordDeletionFeature.h</locationURI>
</link>
<link>
+ <name>FF/SpanLength.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SpanLength.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/SpanLength.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SpanLength.h</locationURI>
+ </link>
+ <link>
+ <name>FF/SparseHieroReorderingFeature.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SparseHieroReorderingFeature.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/SparseHieroReorderingFeature.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SparseHieroReorderingFeature.h</locationURI>
+ </link>
+ <link>
+ <name>FF/SparseHieroReorderingFeatureTest.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SparseHieroReorderingFeatureTest.cpp</locationURI>
+ </link>
+ <link>
<name>FF/StatefulFeatureFunction.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/StatefulFeatureFunction.cpp</locationURI>
@@ -1332,6 +1426,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/StatelessFeatureFunction.h</locationURI>
</link>
<link>
+ <name>FF/SyntaxRHS.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SyntaxRHS.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/SyntaxRHS.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/SyntaxRHS.h</locationURI>
+ </link>
+ <link>
<name>FF/TargetBigramFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetBigramFeature.cpp</locationURI>
@@ -1437,6 +1541,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/Base.h</locationURI>
</link>
<link>
+ <name>LM/BilingualLM.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/BilingualLM.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/BilingualLM.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/BilingualLM.h</locationURI>
+ </link>
+ <link>
<name>LM/ChartState.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/ChartState.h</locationURI>
@@ -1507,6 +1621,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/LDHT.h</locationURI>
</link>
<link>
+ <name>LM/MaxEntSRI.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/MaxEntSRI.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/MaxEntSRI.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/MaxEntSRI.h</locationURI>
+ </link>
+ <link>
<name>LM/MultiFactor.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/MultiFactor.cpp</locationURI>
@@ -1607,6 +1731,31 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/backward.arpa</locationURI>
</link>
<link>
+ <name>LM/bilingual-lm</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>PP/CountsPhraseProperty.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/CountsPhraseProperty.cpp</locationURI>
+ </link>
+ <link>
+ <name>PP/CountsPhraseProperty.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/CountsPhraseProperty.h</locationURI>
+ </link>
+ <link>
<name>PP/Factory.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/Factory.cpp</locationURI>
@@ -1617,16 +1766,181 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/Factory.h</locationURI>
</link>
<link>
+ <name>PP/NonTermContextProperty.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/NonTermContextProperty.cpp</locationURI>
+ </link>
+ <link>
+ <name>PP/NonTermContextProperty.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/NonTermContextProperty.h</locationURI>
+ </link>
+ <link>
+ <name>PP/OrientationPhraseProperty.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/OrientationPhraseProperty.cpp</locationURI>
+ </link>
+ <link>
+ <name>PP/OrientationPhraseProperty.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/OrientationPhraseProperty.h</locationURI>
+ </link>
+ <link>
+ <name>PP/PhraseProperty.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/PhraseProperty.cpp</locationURI>
+ </link>
+ <link>
<name>PP/PhraseProperty.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/PhraseProperty.h</locationURI>
</link>
<link>
+ <name>PP/SourceLabelsPhraseProperty.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/SourceLabelsPhraseProperty.cpp</locationURI>
+ </link>
+ <link>
+ <name>PP/SourceLabelsPhraseProperty.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/SourceLabelsPhraseProperty.h</locationURI>
+ </link>
+ <link>
+ <name>PP/SpanLengthPhraseProperty.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/SpanLengthPhraseProperty.cpp</locationURI>
+ </link>
+ <link>
+ <name>PP/SpanLengthPhraseProperty.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/SpanLengthPhraseProperty.h</locationURI>
+ </link>
+ <link>
<name>PP/TreeStructurePhraseProperty.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TreeStructurePhraseProperty.h</locationURI>
</link>
<link>
+ <name>Syntax/BoundedPriorityContainer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/BoundedPriorityContainer.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/Cube.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/Cube.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/Cube.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/Cube.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/CubeQueue.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/CubeQueue.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/CubeQueue.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/CubeQueue.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/KBestExtractor.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/KBestExtractor.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/KBestExtractor.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/KBestExtractor.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/NonTerminalMap.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/NonTerminalMap.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/PHyperedge.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/PHyperedge.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/PVertex.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/PVertex.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/RuleTable.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/RuleTable.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/RuleTableFF.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/RuleTableFF.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/RuleTableFF.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/RuleTableFF.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SHyperedge.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SHyperedge.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SHyperedge.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SHyperedge.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SHyperedgeBundle.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SHyperedgeBundle.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SHyperedgeBundleScorer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SHyperedgeBundleScorer.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SVertex.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SVertex.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SVertex.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SVertex.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SVertexRecombinationOrderer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SVertexRecombinationOrderer.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SVertexStack.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SVertexStack.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SymbolEqualityPred.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SymbolEqualityPred.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/SymbolHasher.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SymbolHasher.h</locationURI>
+ </link>
+ <link>
<name>TranslationModel/BilingualDynSuffixArray.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/BilingualDynSuffixArray.cpp</locationURI>
@@ -1672,6 +1986,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionary.h</locationURI>
</link>
<link>
+ <name>TranslationModel/PhraseDictionaryDynSuffixArray.README</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README</locationURI>
+ </link>
+ <link>
<name>TranslationModel/PhraseDictionaryDynSuffixArray.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp</locationURI>
@@ -1762,6 +2081,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryTreeAdaptor.h</locationURI>
</link>
<link>
+ <name>TranslationModel/ProbingPT</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>TranslationModel/RuleTable</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
@@ -1782,6 +2106,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/SkeletonPT.h</locationURI>
</link>
<link>
+ <name>TranslationModel/UG</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>TranslationModel/WordCoocTable.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.cpp</locationURI>
@@ -1797,24 +2126,29 @@
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test</name>
- <type>2</type>
- <locationURI>virtual:/virtual</locationURI>
- </link>
- <link>
<name>bin/factor.log</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/bin/factor.log</locationURI>
</link>
<link>
- <name>bin/gcc-4.7</name>
+ <name>bin/gcc-4.8</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/lm.log</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/lm.log</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/lm.log</locationURI>
+ </link>
+ <link>
+ <name>extract-ghkm/PhraseOrientation.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.cpp</locationURI>
+ </link>
+ <link>
+ <name>extract-ghkm/PhraseOrientation.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LexicalReordering.cpp</name>
@@ -1847,6 +2181,36 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReorderingTable.h</locationURI>
</link>
<link>
+ <name>FF/LexicalReordering/ReorderingStack.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/ReorderingStack.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/LexicalReordering/ReorderingStack.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/ReorderingStack.h</locationURI>
+ </link>
+ <link>
+ <name>FF/LexicalReordering/SparseReordering.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/SparseReordering.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/LexicalReordering/SparseReordering.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/SparseReordering.h</locationURI>
+ </link>
+ <link>
+ <name>FF/OSM-Feature/KenOSM.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/KenOSM.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/OSM-Feature/KenOSM.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/KenOSM.h</locationURI>
+ </link>
+ <link>
<name>FF/OSM-Feature/OpSequenceModel.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/OpSequenceModel.cpp</locationURI>
@@ -1867,6 +2231,171 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/osmHyp.h</locationURI>
</link>
<link>
+ <name>LM/bilingual-lm/BiLM_NPLM.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bilingual-lm/BiLM_NPLM.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/bilingual-lm/BiLM_NPLM.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bilingual-lm/BiLM_NPLM.h</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/gcc-4.8</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/OxLM.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLM.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/OxLM.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLM.h</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/OxLMMapper.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLMMapper.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/OxLMMapper.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLMMapper.h</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/OxLMParallelMapper.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLMParallelMapper.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/OxLMParallelMapper.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/OxLMParallelMapper.h</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/SourceOxLM.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/SourceOxLM.cpp</locationURI>
+ </link>
+ <link>
+ <name>LM/oxlm/SourceOxLM.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/SourceOxLM.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/DerivationWriter.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/DerivationWriter.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/DerivationWriter.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/DerivationWriter.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Manager-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Manager-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Manager.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Manager.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/OovHandler-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/OovHandler-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/OovHandler.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/OovHandler.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/PChart.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/PChart.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/PChart.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/PChart.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/PHyperedgeToSHyperedgeBundle.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/ParserCallback.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/ParserCallback.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrie.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrie.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieCYKPlus.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieCYKPlus.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieCYKPlus.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieCYKPlus.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieCreator.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieCreator.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieLoader.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieLoader.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieLoader.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieLoader.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieScope3.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieScope3.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/RuleTrieScope3.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/RuleTrieScope3.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/SChart.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/SChart.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/SChart.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/SChart.h</locationURI>
+ </link>
+ <link>
<name>TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp</locationURI>
@@ -2172,6 +2701,96 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/DynSAInclude/vocab.h</locationURI>
</link>
<link>
+ <name>TranslationModel/ProbingPT/Jamfile</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/Jamfile</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/ProbingPT.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/ProbingPT.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/hash.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/hash.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/huffmanish.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/huffmanish.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/line_splitter.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/line_splitter.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/line_splitter.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/line_splitter.hh</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/probing_hash_utils.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/probing_hash_utils.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.hh</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/quering.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/quering.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.hh</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/storing.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/storing.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/storing.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/storing.hh</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/tests</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/vocabid.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/vocabid.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/vocabid.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/vocabid.hh</locationURI>
+ </link>
+ <link>
<name>TranslationModel/RuleTable/Loader.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/RuleTable/Loader.h</locationURI>
@@ -2342,6 +2961,161 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h</locationURI>
</link>
<link>
+ <name>TranslationModel/UG/Jamfile</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/Jamfile</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/Makefile</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/Makefile</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/bin</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/count-ptable-features.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/count-ptable-features.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mmsapt.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mmsapt.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mmsapt.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mmsapt.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mmsapt_align.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mmsapt_align.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/ptable-describe-features.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/ptable-describe-features.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/ptable-lookup.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/ptable-lookup.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_phrase_key.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_phrase_key.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_phrase_scorers.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_phrase_scorers.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_base.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_base.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_coherence.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_lex1.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_lex1.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_logcnt.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_logcnt.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_pbwd.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_pbwd.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_pfwd.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_pfwd.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_phrasecount.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_phrasecount.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_provenance.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_provenance.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_rareness.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_rareness.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_unaligned.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_unaligned.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sim-pe.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sim-pe.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/spe-check-coverage.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/spe-check-coverage.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/spe-check-coverage2.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/spe-check-coverage2.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/spe-check-coverage3.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/spe-check-coverage3.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/try-align.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/try-align.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/try-align2.cc</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/try-align2.cc</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/util</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>TranslationModel/fuzzy-match/Alignments.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/fuzzy-match/Alignments.cpp</locationURI>
@@ -2407,1039 +3181,2269 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/fuzzy-match/create_xml.h</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7</name>
+ <name>bin/gcc-4.8/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release</name>
+ <name>LM/bin/BackwardTest.test/gcc-4.8</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7</name>
+ <name>LM/bin/gcc-4.8/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/pt.log</name>
+ <name>Syntax/S2T/Parsers/Parser.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/pt.log</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Parser.h</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release</name>
+ <name>Syntax/S2T/Parsers/RecursiveCYKPlusParser</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release</name>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on</name>
+ <name>TranslationModel/ProbingPT/tests/tokenization_tests.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/tests/tokenization_tests.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/tests/vocabid_test.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/tests/vocabid_test.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/bin/gcc-4.8</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static</name>
+ <name>TranslationModel/UG/generic/Jamfile</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/Jamfile</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on</name>
+ <name>TranslationModel/UG/generic/file_io</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static</name>
+ <name>TranslationModel/UG/generic/program_options</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi</name>
+ <name>TranslationModel/UG/generic/sampling</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static</name>
+ <name>TranslationModel/UG/generic/sorting</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi</name>
+ <name>TranslationModel/UG/generic/stringdist</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/AlignmentInfo.o</name>
+ <name>TranslationModel/UG/generic/threading</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/Jamfile</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/AlignmentInfo.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/Jamfile</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoCollection.o</name>
+ <name>TranslationModel/UG/mm/Makefile</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoCollection.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/Makefile</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoTest.o</name>
+ <name>TranslationModel/UG/mm/calc-coverage.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoTest.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/calc-coverage.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Backward.o</name>
+ <name>TranslationModel/UG/mm/custom-pt.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Backward.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/custom-pt.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardLMState.o</name>
+ <name>TranslationModel/UG/mm/mam2symal.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardLMState.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mam2symal.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Base.o</name>
+ <name>TranslationModel/UG/mm/mam_verify.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Base.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mam_verify.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BitmapContainer.o</name>
+ <name>TranslationModel/UG/mm/mmlex-build.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BitmapContainer.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mmlex-build.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BleuScoreFeature.o</name>
+ <name>TranslationModel/UG/mm/mmlex-lookup.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BleuScoreFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mmlex-lookup.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartCell.o</name>
+ <name>TranslationModel/UG/mm/mtt-build.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartCell.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mtt-build.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartCellCollection.o</name>
+ <name>TranslationModel/UG/mm/mtt-count-words.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartCellCollection.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mtt-count-words.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartHypothesis.o</name>
+ <name>TranslationModel/UG/mm/mtt-demo1.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartHypothesis.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mtt-demo1.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartHypothesisCollection.o</name>
+ <name>TranslationModel/UG/mm/mtt-dump.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartHypothesisCollection.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mtt-dump.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartManager.o</name>
+ <name>TranslationModel/UG/mm/mtt.count.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartManager.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/mtt.count.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartParser.o</name>
+ <name>TranslationModel/UG/mm/num_read_write.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartParser.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/num_read_write.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptionList.o</name>
+ <name>TranslationModel/UG/mm/num_read_write.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptionList.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/num_read_write.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/obsolete</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptions.o</name>
+ <name>TranslationModel/UG/mm/symal2mam.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptions.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/symal2mam.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTrellisDetour.o</name>
+ <name>TranslationModel/UG/mm/test-dynamic-im-tsa.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTrellisDetour.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTrellisDetourQueue.o</name>
+ <name>TranslationModel/UG/mm/tpt_pickler.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTrellisDetourQueue.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/tpt_pickler.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTrellisNode.o</name>
+ <name>TranslationModel/UG/mm/tpt_pickler.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTrellisNode.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/tpt_pickler.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTrellisPath.o</name>
+ <name>TranslationModel/UG/mm/tpt_tightindex.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ChartTrellisPath.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/tpt_tightindex.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ConfusionNet.o</name>
+ <name>TranslationModel/UG/mm/tpt_tightindex.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ConfusionNet.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/tpt_tightindex.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeFeature.o</name>
+ <name>TranslationModel/UG/mm/tpt_tokenindex.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/tpt_tokenindex.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeGraph.o</name>
+ <name>TranslationModel/UG/mm/tpt_tokenindex.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeGraph.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/tpt_tokenindex.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeStep.o</name>
+ <name>TranslationModel/UG/mm/tpt_typedefs.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeStep.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/tpt_typedefs.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeStepGeneration.o</name>
+ <name>TranslationModel/UG/mm/ug_bitext.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeStepGeneration.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_bitext.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeStepTranslation.o</name>
+ <name>TranslationModel/UG/mm/ug_bitext.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DecodeStepTranslation.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_bitext.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Dictionary.o</name>
+ <name>TranslationModel/UG/mm/ug_conll_bottom_up_token.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Dictionary.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DummyScoreProducers.o</name>
+ <name>TranslationModel/UG/mm/ug_conll_record.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/DummyScoreProducers.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_conll_record.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FFState.o</name>
+ <name>TranslationModel/UG/mm/ug_conll_record.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FFState.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_conll_record.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Factor.o</name>
+ <name>TranslationModel/UG/mm/ug_corpus_token.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Factor.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_corpus_token.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FactorCollection.o</name>
+ <name>TranslationModel/UG/mm/ug_corpus_token.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FactorCollection.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_corpus_token.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FactorTypeSet.o</name>
+ <name>TranslationModel/UG/mm/ug_deptree.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FactorTypeSet.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_deptree.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Factory.o</name>
+ <name>TranslationModel/UG/mm/ug_deptree.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Factory.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_deptree.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FeatureFunction.o</name>
+ <name>TranslationModel/UG/mm/ug_im_tsa.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FeatureFunction.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_im_tsa.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FeatureVector.o</name>
+ <name>TranslationModel/UG/mm/ug_im_ttrack.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FeatureVector.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_im_ttrack.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FeatureVectorTest.o</name>
+ <name>TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FeatureVectorTest.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/File.o</name>
+ <name>TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/File.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FloydWarshall.o</name>
+ <name>TranslationModel/UG/mm/ug_load_primer.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/FloydWarshall.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_load_primer.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/GenerationDictionary.o</name>
+ <name>TranslationModel/UG/mm/ug_load_primer.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/GenerationDictionary.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_load_primer.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/GlobalLexicalModel.o</name>
+ <name>TranslationModel/UG/mm/ug_lru_cache.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/GlobalLexicalModel.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_lru_cache.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/GlobalLexicalModelUnlimited.o</name>
+ <name>TranslationModel/UG/mm/ug_mm_2d_table.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/GlobalLexicalModelUnlimited.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_mm_2d_table.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Hypothesis.o</name>
+ <name>TranslationModel/UG/mm/ug_mm_tsa.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Hypothesis.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_mm_tsa.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/HypothesisStack.o</name>
+ <name>TranslationModel/UG/mm/ug_mm_tsa_tree_iterator.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/HypothesisStack.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_mm_tsa_tree_iterator.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/HypothesisStackCubePruning.o</name>
+ <name>TranslationModel/UG/mm/ug_mm_ttrack.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/HypothesisStackCubePruning.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_mm_ttrack.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/HypothesisStackNormal.o</name>
+ <name>TranslationModel/UG/mm/ug_mmbitext.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/HypothesisStackNormal.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_mmbitext.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/IRST.o</name>
+ <name>TranslationModel/UG/mm/ug_mmbitext.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/IRST.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_mmbitext.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Implementation.o</name>
+ <name>TranslationModel/UG/mm/ug_phrasepair.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Implementation.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_phrasepair.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Incremental.o</name>
+ <name>TranslationModel/UG/mm/ug_phrasepair.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Incremental.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_phrasepair.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/InputFileStream.o</name>
+ <name>TranslationModel/UG/mm/ug_tsa_array_entry.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/InputFileStream.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_tsa_array_entry.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/InputType.o</name>
+ <name>TranslationModel/UG/mm/ug_tsa_array_entry.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/InputType.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Joint.o</name>
+ <name>TranslationModel/UG/mm/ug_tsa_base.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Joint.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_tsa_base.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Ken.o</name>
+ <name>TranslationModel/UG/mm/ug_tsa_bitset_cache.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Ken.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LMList.o</name>
+ <name>TranslationModel/UG/mm/ug_tsa_tree_iterator.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LMList.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LVoc.o</name>
+ <name>TranslationModel/UG/mm/ug_ttrack_base.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LVoc.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_ttrack_base.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReordering.o</name>
+ <name>TranslationModel/UG/mm/ug_ttrack_base.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReordering.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_ttrack_base.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingState.o</name>
+ <name>TranslationModel/UG/mm/ug_ttrack_position.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingState.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_ttrack_position.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTable.o</name>
+ <name>TranslationModel/UG/mm/ug_ttrack_position.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTable.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_ttrack_position.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Manager.o</name>
+ <name>TranslationModel/UG/mm/ug_typedefs.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Manager.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/ug_typedefs.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MockHypothesis.o</name>
+ <name>TranslationModel/UG/util/Makefile</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MockHypothesis.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/util/Makefile</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MosesTest.o</name>
+ <name>TranslationModel/UG/util/ibm1-align</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MosesTest.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/util/ibm1-align</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o</name>
+ <name>TranslationModel/UG/util/ibm1-align.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/util/ibm1-align.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/NonTerminal.o</name>
+ <name>TranslationModel/UG/util/tokenindex.dump.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/NonTerminal.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/util/tokenindex.dump.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ORLM.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/gcc-4.8/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ORLM.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PCNTools.o</name>
+ <name>Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PCNTools.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ParallelBackoff.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ParallelBackoff.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Parameter.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/Parser.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Parameter.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PartialTranslOptColl.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PartialTranslOptColl.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Phrase.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Phrase.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseBoundaryFeature.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseBoundaryFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseLengthFeature.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseLengthFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseLengthFeatureTest.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseLengthFeatureTest.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhrasePairFeature.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhrasePairFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PrefixTreeMap.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/TailLattice.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PrefixTreeMap.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Remote.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Remote.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ReorderingConstraint.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ReorderingConstraint.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ReorderingStack.o</name>
+ <name>Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ReorderingStack.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/bin/gcc-4.8/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/RuleCube.o</name>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/file_io/ug_stream.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/RuleCube.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/RuleCubeItem.o</name>
+ <name>TranslationModel/UG/generic/file_io/ug_stream.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/RuleCubeItem.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/file_io/ug_stream.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/RuleCubeQueue.o</name>
+ <name>TranslationModel/UG/generic/program_options/ug_get_options.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/RuleCubeQueue.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SRI.o</name>
+ <name>TranslationModel/UG/generic/program_options/ug_get_options.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SRI.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/program_options/ug_get_options.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollection.o</name>
+ <name>TranslationModel/UG/generic/program_options/ug_splice_arglist.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollection.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollectionTest.o</name>
+ <name>TranslationModel/UG/generic/program_options/ug_splice_arglist.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollectionTest.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ScoreProducer.o</name>
+ <name>TranslationModel/UG/generic/sampling/Sampling.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ScoreProducer.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/sampling/Sampling.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Search.o</name>
+ <name>TranslationModel/UG/generic/sorting/NBestList.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Search.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/sorting/NBestList.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SearchCubePruning.o</name>
+ <name>TranslationModel/UG/generic/sorting/VectorIndexSorter.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SearchCubePruning.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SearchNormal.o</name>
+ <name>TranslationModel/UG/generic/stringdist/ug_stringdist.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SearchNormal.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SearchNormalBatch.o</name>
+ <name>TranslationModel/UG/generic/stringdist/ug_stringdist.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SearchNormalBatch.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Sentence.o</name>
+ <name>TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Sentence.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SentenceStats.o</name>
+ <name>TranslationModel/UG/generic/threading/ug_thread_safe_counter.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SentenceStats.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o</name>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/obsolete/ug_bitext_base.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SourceWordDeletionFeature.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/gcc-4.8/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/AlignmentInfo.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/AlignmentInfo.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoCollection.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoCollection.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoTest.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoTest.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Backward.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Backward.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardLMState.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SourceWordDeletionFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardLMState.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SparsePhraseDictionaryFeature.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Base.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SparsePhraseDictionaryFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Base.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SquareMatrix.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BilingualLM.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SquareMatrix.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BilingualLM.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/StaticData.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BitmapContainer.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/StaticData.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BitmapContainer.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetBigramFeature.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartCell.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetBigramFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartCell.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetBigramFeatureTest.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartCellCollection.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetBigramFeatureTest.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartCellCollection.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetNgramFeature.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartHypothesis.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetNgramFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartHypothesis.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetPhrase.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartHypothesisCollection.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetPhrase.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartHypothesisCollection.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetPhraseCollection.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartKBestExtractor.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetPhraseCollection.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartKBestExtractor.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetWordInsertionFeature.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartManager.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TargetWordInsertionFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartManager.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ThreadPool.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartParser.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ThreadPool.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartParser.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Timer.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManager.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Timer.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManager.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOption.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOption.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptionList.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptionList.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptions.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptions.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ConfusionNet.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ConfusionNet.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DecodeGraph.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DecodeGraph.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DecodeStep.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DecodeStep.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DecodeStepGeneration.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DecodeStepGeneration.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DecodeStepTranslation.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DecodeStepTranslation.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF_Factory.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF_Factory.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Factor.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Factor.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FactorCollection.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FactorCollection.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FactorTypeSet.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FactorTypeSet.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureVector.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureVector.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureVectorTest.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureVectorTest.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/File.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/File.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FloydWarshall.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FloydWarshall.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/GenerationDictionary.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/GenerationDictionary.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypergraphOutput.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypergraphOutput.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Hypothesis.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Hypothesis.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypothesisStack.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypothesisStack.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypothesisStackCubePruning.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypothesisStackCubePruning.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypothesisStackNormal.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypothesisStackNormal.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/IOWrapper.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/IOWrapper.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Implementation.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Implementation.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Incremental.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Incremental.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/InputFileStream.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/InputFileStream.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/InputPath.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/InputPath.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/InputType.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/InputType.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Joint.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Joint.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Ken.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Ken.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/LVoc.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/LVoc.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/LatticeMBR.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/LatticeMBR.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Manager.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Manager.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MockHypothesis.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MockHypothesis.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MosesTest.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MosesTest.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/NonTerminal.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/NonTerminal.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PCNTools.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PCNTools.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PDTAimp.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PDTAimp.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOption.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Parameter.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Parameter.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PartialTranslOptColl.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PartialTranslOptColl.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Phrase.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Phrase.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PhraseOrientation.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOption.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PhraseOrientation.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollection.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PrefixTreeMap.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollection.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PrefixTreeMap.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionConfusionNet.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Remote.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionConfusionNet.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Remote.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionText.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ReorderingConstraint.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionText.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ReorderingConstraint.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOptionList.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/RuleCube.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationOptionList.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/RuleCube.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationSystem.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/RuleCubeItem.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationSystem.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/RuleCubeItem.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TreeInput.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/RuleCubeQueue.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TreeInput.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/RuleCubeQueue.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TrellisPath.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollection.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TrellisPath.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollection.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TrellisPathCollection.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollectionTest.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TrellisPathCollection.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollectionTest.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/UserMessage.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Search.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/UserMessage.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Search.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Util.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SearchCubePruning.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Util.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SearchCubePruning.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Word.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SearchNormal.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Word.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SearchNormal.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/WordLattice.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SearchNormalBatch.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/WordLattice.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SearchNormalBatch.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/WordTranslationFeature.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Sentence.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/WordTranslationFeature.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Sentence.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/WordsBitmap.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SentenceStats.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/WordsBitmap.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SentenceStats.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/WordsRange.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/WordsRange.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/XmlOption.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SkeletonLM.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/XmlOption.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SkeletonLM.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/libmoses.a</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SquareMatrix.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/libmoses.a</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SquareMatrix.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/moses_test</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/StaticData.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/moses_test</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/StaticData.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/moses_test.passed</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TargetPhrase.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/moses_test.passed</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TargetPhrase.o</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TargetPhraseCollection.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TargetPhraseCollection.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ThreadPool.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ThreadPool.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Timer.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Timer.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationAnalysis.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationAnalysis.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Backward.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOption.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOption.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollection.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollection.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionConfusionNet.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionConfusionNet.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionLattice.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionLattice.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionText.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionText.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionList.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationOptionList.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationTask.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationTask.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TreeInput.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TreeInput.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TrellisPath.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TrellisPath.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TrellisPathCollection.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TrellisPathCollection.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/UserMessage.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/UserMessage.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Util.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Util.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Word.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Word.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/WordLattice.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/WordLattice.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/WordsBitmap.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/WordsBitmap.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/WordsRange.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/WordsRange.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/XmlOption.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/XmlOption.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/libmoses.a</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/libmoses.a</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mbr.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mbr.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/moses_test</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/moses_test</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/moses_test.passed</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/moses_test.passed</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/IRST.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/IRST.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ORLM.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ORLM.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Rand.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Rand.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/BleuScoreFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/BleuScoreFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/ConstrainedDecoding.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/ConstrainedDecoding.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/ControlRecombination.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/ControlRecombination.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/CountNonTerms.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/CountNonTerms.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/CoveredReferenceFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/CoveredReferenceFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/DecodeFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/DecodeFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/DistortionScoreProducer.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/DistortionScoreProducer.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/ExternalFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/ExternalFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/FFState.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/FFState.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/FeatureFunction.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/FeatureFunction.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/GlobalLexicalModel.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/GlobalLexicalModel.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/GlobalLexicalModelUnlimited.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/GlobalLexicalModelUnlimited.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/HyperParameterAsWeight.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/HyperParameterAsWeight.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/InputFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/InputFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/InternalTree.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/InternalTree.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/MaxSpanFreeNonTermSource.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/MaxSpanFreeNonTermSource.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/NieceTerminal.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/NieceTerminal.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/OSM-Feature</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhraseBoundaryFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhraseBoundaryFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhraseLengthFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhraseLengthFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhraseOrientationFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhraseOrientationFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhrasePairFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhrasePairFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhrasePenalty.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/PhrasePenalty.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/ReferenceComparison.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/ReferenceComparison.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/RuleScope.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/RuleScope.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SetSourcePhrase.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SetSourcePhrase.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SkeletonChangeInput.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SkeletonChangeInput.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SkeletonStatefulFF.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SkeletonStatefulFF.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SkeletonStatelessFF.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SkeletonStatelessFF.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SoftMatchingFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SoftMatchingFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SoftSourceSyntacticConstraintsFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SoftSourceSyntacticConstraintsFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SourceGHKMTreeInputMatchFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SourceGHKMTreeInputMatchFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SourceWordDeletionFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SourceWordDeletionFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SpanLength.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SpanLength.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SparseHieroReorderingFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SparseHieroReorderingFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SparseHieroReorderingFeatureTest.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SparseHieroReorderingFeatureTest.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/StatefulFeatureFunction.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/StatefulFeatureFunction.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/StatelessFeatureFunction.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/StatelessFeatureFunction.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SyntaxRHS.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/SyntaxRHS.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/TargetBigramFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/TargetBigramFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/TargetNgramFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/TargetNgramFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/TargetWordInsertionFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/TargetWordInsertionFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/TreeStructureFeature.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/TreeStructureFeature.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/UnknownWordPenaltyProducer.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/UnknownWordPenaltyProducer.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/WordPenaltyProducer.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Backward.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/WordPenaltyProducer.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardLMState.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/WordTranslationFeature.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardLMState.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/WordTranslationFeature.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/CountsPhraseProperty.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/CountsPhraseProperty.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/Factory.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/Factory.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest.output</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/NonTermContextProperty.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest.output</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/NonTermContextProperty.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest.run</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/OrientationPhraseProperty.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest.run</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/OrientationPhraseProperty.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest.test</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/PhraseProperty.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BackwardTest.test</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/PhraseProperty.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Base.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/SourceLabelsPhraseProperty.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Base.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/SourceLabelsPhraseProperty.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Implementation.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/SpanLengthPhraseProperty.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Implementation.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PP/SpanLengthPhraseProperty.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Joint.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/Cube.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Joint.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/Cube.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Ken.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/CubeQueue.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Ken.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/CubeQueue.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/KBestExtractor.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/KBestExtractor.o</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Remote.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/RuleTableFF.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/Remote.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/RuleTableFF.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/SHyperedge.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/SHyperedge.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/BilingualDynSuffixArray.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/SVertex.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/BilingualDynSuffixArray.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/SVertex.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/BilingualDynSuffixArray.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/BilingualDynSuffixArray.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSuffixArray.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSuffixArray.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSuffixArray.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSuffixArray.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionary.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionary.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionary.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionary.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryDynSuffixArray.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryDynSuffixArray.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryDynSuffixArray.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryDynSuffixArray.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMemory.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMemory.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMemory.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMemory.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMultiModel.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMultiModel.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMultiModel.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMultiModel.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMultiModelCounts.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMultiModelCounts.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMultiModelCounts.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryMultiModelCounts.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryNode.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryNodeMemory.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryNode.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryNodeMemory.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTree.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryScope3.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTree.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryScope3.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTreeAdaptor.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTransliteration.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTreeAdaptor.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTransliteration.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTree.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTree.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTreeAdaptor.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/PhraseDictionaryTreeAdaptor.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/SkeletonPT.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/SkeletonPT.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/WordCoocTable.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/WordCoocTable.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BlockHashIndex.o</name>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Backward.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Backward.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardLMState.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardLMState.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest.output</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest.output</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest.run</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest.run</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest.test</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BackwardTest.test</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Base.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Base.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BilingualLM.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BilingualLM.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Implementation.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Implementation.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Joint.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Joint.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Ken.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Ken.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Remote.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Remote.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o</locationURI>
+ </link>
+ <link>
+ <name>LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SkeletonLM.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/BackwardTest.test/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SkeletonLM.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BlockHashIndex.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BlockHashIndex.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/CmphStringVectorAdapter.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/CmphStringVectorAdapter.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTableCompact.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTableCompact.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTableCreator.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTableCreator.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MurmurHash3.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MurmurHash3.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PhraseDecoder.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/BlockHashIndex.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PhraseDecoder.o</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/CmphStringVectorAdapter.o</name>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryCompact.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/CmphStringVectorAdapter.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryCompact.o</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTableCompact.o</name>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PhraseTableCreator.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTableCompact.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PhraseTableCreator.o</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTableCreator.o</name>
+ <name>TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ThrowingFwrite.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTableCreator.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ThrowingFwrite.o</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MurmurHash3.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/count-ptable-features</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/MurmurHash3.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/count-ptable-features</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseDecoder.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/count-ptable-features.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseDecoder.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/count-ptable-features.o</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryCompact.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmsapt.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryCompact.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmsapt.o</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseTableCreator.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmsapt_align.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/PhraseTableCreator.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmsapt_align.o</locationURI>
</link>
<link>
- <name>TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ThrowingFwrite.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-describe-features</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/ThrowingFwrite.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-describe-features</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-describe-features.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-describe-features.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/spe-check-coverage</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/spe-check-coverage</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/DotChartInMemory.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/spe-check-coverage.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/DotChartInMemory.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/spe-check-coverage.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/DotChartOnDisk.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/try-align</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/DotChartOnDisk.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/try-align</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/FileHandler.o</name>
+ <name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/try-align.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/FileHandler.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/try-align.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/params.o</name>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/LexicalReordering.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/params.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/LexicalReordering.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/LexicalReorderingState.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/LexicalReorderingState.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/LexicalReorderingTable.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/LexicalReorderingTable.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/ReorderingStack.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/ReorderingStack.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/SparseReordering.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/LexicalReordering/SparseReordering.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/OSM-Feature/KenOSM.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/OSM-Feature/KenOSM.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/OSM-Feature/OpSequenceModel.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/OSM-Feature/OpSequenceModel.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/OSM-Feature/osmHyp.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FF/OSM-Feature/osmHyp.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/DerivationWriter.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/DerivationWriter.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/PChart.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/PChart.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/Parsers</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/vocab.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/RuleTrieCYKPlus.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/vocab.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/RuleTrieCYKPlus.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderCompact.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/RuleTrieLoader.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderCompact.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/RuleTrieLoader.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderFactory.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/RuleTrieScope3.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderFactory.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/RuleTrieScope3.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderHiero.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/SChart.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderHiero.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/SChart.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderStandard.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderStandard.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryMemory.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryMemory.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryNodeMemory.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryNodeMemory.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryOnDisk.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/CompletedRuleCollection.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryOnDisk.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/CompletedRuleCollection.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/Trie.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/DotChartInMemory.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/Trie.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/DotChartInMemory.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/UTrie.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/DotChartOnDisk.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/UTrie.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/CYKPlusParser/DotChartOnDisk.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/UTrieNode.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/FileHandler.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/UTrieNode.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/FileHandler.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/ApplicableRuleTrie.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/params.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/ApplicableRuleTrie.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/params.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/Parser.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/vocab.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/Parser.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/DynSAInclude/vocab.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/StackLatticeBuilder.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderCompact.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/StackLatticeBuilder.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderCompact.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/VarSpanTrieBuilder.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderFactory.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/VarSpanTrieBuilder.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderFactory.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/Alignments.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderHiero.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/Alignments.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderHiero.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/FuzzyMatchWrapper.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderStandard.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/FuzzyMatchWrapper.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/LoaderStandard.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/SentenceAlignment.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/SentenceAlignment.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryOnDisk.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/PhraseDictionaryOnDisk.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/Trie.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/Trie.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/UTrie.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/UTrie.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/UTrieNode.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/RuleTable/UTrieNode.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/ApplicableRuleTrie.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/ApplicableRuleTrie.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/Parser.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/Parser.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/StackLatticeBuilder.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/StackLatticeBuilder.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/VarSpanTrieBuilder.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/Scope3Parser/VarSpanTrieBuilder.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/Alignments.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/Alignments.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/FuzzyMatchWrapper.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/FuzzyMatchWrapper.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/SentenceAlignment.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/SentenceAlignment.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/SuffixArray.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/SuffixArray.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/Vocabulary.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/Vocabulary.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/create_xml.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/create_xml.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_get_options.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_get_options.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_splice_arglist.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_splice_arglist.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_stream.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_stream.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_thread_safe_counter.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/generic/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_thread_safe_counter.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/calc-coverage</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/calc-coverage</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/calc-coverage.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/calc-coverage.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mam2symal</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mam2symal</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mam2symal.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mam2symal.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mam_verify</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mam_verify</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mam_verify.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mam_verify.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmlex-build</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmlex-build</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmlex-build.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmlex-build.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmlex-lookup</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmlex-lookup</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmlex-lookup.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mmlex-lookup.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-build</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-build</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-build.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-build.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-count-words</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-count-words</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-count-words.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-count-words.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-demo1</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-demo1</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-demo1.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-demo1.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-dump</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-dump</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-dump.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mtt-dump.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/num_read_write.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/num_read_write.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/symal2mam</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/symal2mam</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/symal2mam.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/symal2mam.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/tpt_pickler.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/tpt_pickler.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/tpt_tightindex.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/tpt_tightindex.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/tpt_tokenindex.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/tpt_tokenindex.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_bitext.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_bitext.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_conll_record.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_conll_record.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_corpus_token.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_corpus_token.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_deptree.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_deptree.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_load_primer.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_load_primer.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_mmbitext.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_mmbitext.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_phrasepair.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_phrasepair.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_tsa_array_entry.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_tsa_array_entry.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_ttrack_base.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_ttrack_base.o</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_ttrack_position.o</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/mm/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ug_ttrack_position.o</locationURI>
+ </link>
+ <link>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/Parsers/Scope3Parser</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/SuffixArray.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/SuffixArray.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/Vocabulary.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/Vocabulary.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.o</locationURI>
</link>
<link>
- <name>bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/create_xml.o</name>
+ <name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.o</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/TranslationModel/fuzzy-match/create_xml.o</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.o</locationURI>
</link>
</linkedResources>
</projectDescription>
diff --git a/contrib/other-builds/score/.cproject b/contrib/other-builds/score/.cproject
index f51f35ef5..044fad896 100644
--- a/contrib/other-builds/score/.cproject
+++ b/contrib/other-builds/score/.cproject
@@ -1,62 +1,74 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.634831890">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.634831890" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.852684782">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.852684782" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.634831890" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.634831890." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.1361730953" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.2040884960" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/score/Debug}" id="cdt.managedbuild.builder.gnu.cross.1709170788" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.786339685" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1516054114" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.1061705384" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2108019237" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
- </tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1013232238" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.1874109813" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.2032778777" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.include.paths.1713606194" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.852684782" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.852684782." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.628760407" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.40031730" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/score}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.1494414913" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1369030665" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1299858559" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1103483066" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.11930558" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.1147799314" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.509920006" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1638578889" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1279743060" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1563503789" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <option id="gnu.cpp.link.option.paths.1704292838" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.2096513387" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.1877980632" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.1972289345" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1767499123" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.9477188" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1008235812" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.paths.2139594100" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
- <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="/opt/local/lib"/>
</option>
- <option id="gnu.cpp.link.option.libs.936233947" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
- <listOptionValue builtIn="false" value="z"/>
+ <option id="gnu.cpp.link.option.libs.615408765" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="moses"/>
+ <listOptionValue builtIn="false" value="search"/>
+ <listOptionValue builtIn="false" value="OnDiskPt"/>
+ <listOptionValue builtIn="false" value="lm"/>
<listOptionValue builtIn="false" value="util"/>
- <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
- <listOptionValue builtIn="false" value="boost_system-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="boost_serialization"/>
+ <listOptionValue builtIn="false" value="boost_system"/>
+ <listOptionValue builtIn="false" value="boost_thread"/>
+ <listOptionValue builtIn="false" value="boost_filesystem"/>
+ <listOptionValue builtIn="false" value="pthread"/>
+ <listOptionValue builtIn="false" value="z"/>
+ <listOptionValue builtIn="false" value="bz2"/>
+ <listOptionValue builtIn="false" value="dl"/>
<listOptionValue builtIn="false" value="rt"/>
</option>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.589709979" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.202044854" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.1829423265" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.52947560" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1165474354" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1832317688" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1877599289" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -64,44 +76,44 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
- <cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1994357180">
- <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1994357180" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.1878418244">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1878418244" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
- <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1994357180" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
- <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1994357180." name="/" resourcePath="">
- <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.743463783" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
- <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1353054437" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
- <builder buildPath="${workspace_loc:/score/Release}" id="cdt.managedbuild.builder.gnu.cross.1851758128" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.323743241" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
- <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.534423111" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
- <option id="gnu.c.compiler.option.debugging.level.518786530" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.392640311" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1878418244" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.1878418244." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1661678477" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.848161857" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/score}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.1694318208" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1857970512" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.464441024" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.1302447353" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.143379331" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.859419943" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.307472312" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
- <option id="gnu.cpp.compiler.option.optimization.level.407718562" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
- <option id="gnu.cpp.compiler.option.debugging.level.1687450255" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
- <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.593478428" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1103707928" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.2144910639" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.158963791" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.558236570" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.165176764" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.178129273" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
- <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.25375344" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.1915067544" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.2131232485" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.530558382" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
- <tool id="cdt.managedbuild.tool.gnu.cross.archiver.986435372" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
- <tool id="cdt.managedbuild.tool.gnu.cross.assembler.1833814398" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
- <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1026471548" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.1037806386" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.2129474260" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
@@ -111,23 +123,30 @@
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
- <project id="score.cdt.managedbuild.target.gnu.cross.exe.1539177197" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
+ <project id="score.cdt.managedbuild.target.gnu.exe.812812835" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.634831890;cdt.managedbuild.config.gnu.cross.exe.debug.634831890.;cdt.managedbuild.tool.gnu.cross.c.compiler.786339685;cdt.managedbuild.tool.gnu.c.compiler.input.2108019237">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1878418244;cdt.managedbuild.config.gnu.exe.release.1878418244.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1103707928;cdt.managedbuild.tool.gnu.c.compiler.input.558236570">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1994357180;cdt.managedbuild.config.gnu.cross.exe.release.1994357180.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.307472312;cdt.managedbuild.tool.gnu.cpp.compiler.input.593478428">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.852684782;cdt.managedbuild.config.gnu.exe.debug.852684782.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.2096513387;cdt.managedbuild.tool.gnu.c.compiler.input.1767499123">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.634831890;cdt.managedbuild.config.gnu.cross.exe.debug.634831890.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1013232238;cdt.managedbuild.tool.gnu.cpp.compiler.input.509920006">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1878418244;cdt.managedbuild.config.gnu.exe.release.1878418244.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.464441024;cdt.managedbuild.tool.gnu.cpp.compiler.input.859419943">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
- <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1994357180;cdt.managedbuild.config.gnu.cross.exe.release.1994357180.;cdt.managedbuild.tool.gnu.cross.c.compiler.323743241;cdt.managedbuild.tool.gnu.c.compiler.input.392640311">
- <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.852684782;cdt.managedbuild.config.gnu.exe.debug.852684782.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1299858559;cdt.managedbuild.tool.gnu.cpp.compiler.input.1638578889">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/score"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/score"/>
+ </configuration>
+ </storageModule>
</cproject>
diff --git a/contrib/other-builds/score/.project b/contrib/other-builds/score/.project
index 05564d0f9..10e713124 100644
--- a/contrib/other-builds/score/.project
+++ b/contrib/other-builds/score/.project
@@ -88,16 +88,6 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/ScoreFeature.h</locationURI>
</link>
<link>
- <name>exception.cc</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/util/exception.cc</locationURI>
- </link>
- <link>
- <name>exception.hh</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/util/exception.hh</locationURI>
- </link>
- <link>
<name>score-main.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/score-main.cpp</locationURI>
diff --git a/contrib/other-builds/server/.cproject b/contrib/other-builds/server/.cproject
new file mode 100644
index 000000000..d971684d8
--- /dev/null
+++ b/contrib/other-builds/server/.cproject
@@ -0,0 +1,169 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+ <storageModule moduleId="org.eclipse.cdt.core.settings">
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.1015532240">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1015532240" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+ <externalSettings/>
+ <extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ </extensions>
+ </storageModule>
+ <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.1015532240" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1015532240." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1201298107" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.2097807873" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
+ <builder buildPath="${workspace_loc:/server}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.857185882" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.142173353" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1657626940" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
+ <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.269939241" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1769920565" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.649991225" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../xmlrpc-c/include&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost&quot;"/>
+ </option>
+ <option id="gnu.cpp.compiler.option.preprocessor.def.2063944336" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
+ <listOptionValue builtIn="false" value="MAX_NUM_FACTORS=4"/>
+ <listOptionValue builtIn="false" value="WITH_THREADS"/>
+ <listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/>
+ </option>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.603240279" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.165185265" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
+ <option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.502789927" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.debug.option.debugging.level.1365428538" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.836267531" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1867046221" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1443553047" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
+ <option id="gnu.cpp.link.option.paths.1096041402" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../xmlrpc-c/lib&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/search/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/OnDiskPt/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/lm/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
+ </option>
+ <option id="gnu.cpp.link.option.libs.1087215166" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+ <listOptionValue builtIn="false" value="moses"/>
+ <listOptionValue builtIn="false" value="search"/>
+ <listOptionValue builtIn="false" value="OnDiskPt"/>
+ <listOptionValue builtIn="false" value="lm"/>
+ <listOptionValue builtIn="false" value="util"/>
+ <listOptionValue builtIn="false" value="xmlrpc_server_abyss++"/>
+ <listOptionValue builtIn="false" value="xmlrpc_server++"/>
+ <listOptionValue builtIn="false" value="xmlrpc_server_abyss"/>
+ <listOptionValue builtIn="false" value="xmlrpc_server"/>
+ <listOptionValue builtIn="false" value="xmlrpc_abyss"/>
+ <listOptionValue builtIn="false" value="xmlrpc++ "/>
+ <listOptionValue builtIn="false" value="xmlrpc"/>
+ <listOptionValue builtIn="false" value="xmlrpc_util"/>
+ <listOptionValue builtIn="false" value="xmlrpc_xmlparse"/>
+ <listOptionValue builtIn="false" value="xmlrpc_xmltok"/>
+ <listOptionValue builtIn="false" value="pthread"/>
+ <listOptionValue builtIn="false" value="boost_serialization"/>
+ <listOptionValue builtIn="false" value="boost_iostreams"/>
+ <listOptionValue builtIn="false" value="boost_system"/>
+ <listOptionValue builtIn="false" value="boost_thread"/>
+ <listOptionValue builtIn="false" value="boost_filesystem"/>
+ <listOptionValue builtIn="false" value="z"/>
+ <listOptionValue builtIn="false" value="bz2"/>
+ <listOptionValue builtIn="false" value="dl"/>
+ <listOptionValue builtIn="false" value="rt"/>
+ </option>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.308755092" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
+ <additionalInput kind="additionalinput" paths="$(LIBS)"/>
+ </inputType>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.784062133" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1514675611" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ </tool>
+ </toolChain>
+ </folderInfo>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+ </cconfiguration>
+ <cconfiguration id="cdt.managedbuild.config.gnu.exe.release.179761083">
+ <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.179761083" moduleId="org.eclipse.cdt.core.settings" name="Release">
+ <externalSettings/>
+ <extensions>
+ <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+ <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+ </extensions>
+ </storageModule>
+ <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+ <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.179761083" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+ <folderInfo id="cdt.managedbuild.config.gnu.exe.release.179761083." name="/" resourcePath="">
+ <toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.2024222442" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
+ <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1098252145" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
+ <builder buildPath="${workspace_loc:/server}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.24884855" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.archiver.base.1561001393" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1260095073" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
+ <option id="gnu.cpp.compiler.exe.release.option.optimization.level.824342210" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.exe.release.option.debugging.level.620231073" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.372465520" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1635883096" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
+ <option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.74859509" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
+ <option id="gnu.c.compiler.exe.release.option.debugging.level.1604502606" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+ <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.624155660" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.727800742" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
+ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.1586891175" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1588265513" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+ <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
+ <additionalInput kind="additionalinput" paths="$(LIBS)"/>
+ </inputType>
+ </tool>
+ <tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.727000276" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
+ <inputType id="cdt.managedbuild.tool.gnu.assembler.input.665044877" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+ </tool>
+ </toolChain>
+ </folderInfo>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+ </cconfiguration>
+ </storageModule>
+ <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+ <project id="server.cdt.managedbuild.target.gnu.exe.580879474" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
+ </storageModule>
+ <storageModule moduleId="scannerConfiguration">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.179761083;cdt.managedbuild.config.gnu.exe.release.179761083.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1635883096;cdt.managedbuild.tool.gnu.c.compiler.input.624155660">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.179761083;cdt.managedbuild.config.gnu.exe.release.179761083.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1260095073;cdt.managedbuild.tool.gnu.cpp.compiler.input.372465520">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1015532240;cdt.managedbuild.config.gnu.exe.debug.1015532240.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1657626940;cdt.managedbuild.tool.gnu.cpp.compiler.input.603240279">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ <scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.1015532240;cdt.managedbuild.config.gnu.exe.debug.1015532240.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.165185265;cdt.managedbuild.tool.gnu.c.compiler.input.836267531">
+ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+ </scannerConfigBuildInfo>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/server"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/server"/>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
+</cproject>
diff --git a/contrib/other-builds/server/.project b/contrib/other-builds/server/.project
new file mode 100644
index 000000000..fd9ccc290
--- /dev/null
+++ b/contrib/other-builds/server/.project
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>server</name>
+ <comment></comment>
+ <projects>
+ <project>lm</project>
+ <project>moses</project>
+ <project>OnDiskPt</project>
+ <project>search</project>
+ <project>util</project>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+ <triggers>clean,full,incremental,</triggers>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ <buildCommand>
+ <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+ <triggers>full,incremental,</triggers>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.cdt.core.cnature</nature>
+ <nature>org.eclipse.cdt.core.ccnature</nature>
+ <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+ <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+ </natures>
+ <linkedResources>
+ <link>
+ <name>mosesserver.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-2-PROJECT_LOC/server/mosesserver.cpp</locationURI>
+ </link>
+ </linkedResources>
+</projectDescription>
diff --git a/contrib/rephraser/Jamfile b/contrib/rephraser/Jamfile
new file mode 100644
index 000000000..4d868ddf4
--- /dev/null
+++ b/contrib/rephraser/Jamfile
@@ -0,0 +1 @@
+exe paraphrase : paraphrase.cpp ../../moses//moses ../..//boost_program_options ;
diff --git a/contrib/rephraser/paraphrase.cpp b/contrib/rephraser/paraphrase.cpp
new file mode 100644
index 000000000..ad9dbc891
--- /dev/null
+++ b/contrib/rephraser/paraphrase.cpp
@@ -0,0 +1,148 @@
+// $Id$
+// vim:tabstop=2
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+
+/**
+ * Compute paraphrases from the phrase table
+**/
+#include <cmath>
+#include <iostream>
+#include <map>
+
+#include <boost/program_options.hpp>
+
+#include "util/double-conversion/double-conversion.h"
+#include "util/exception.hh"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+//using namespace Moses;
+using namespace std;
+
+namespace po = boost::program_options;
+
+typedef multimap<float,string> Probs;
+
+static float threshold = 1e-04;
+static size_t maxE = 10000; //histogram pruning
+
+static void add(const string& e, const vector<float> scores,
+ Probs& p_e_given_f, Probs& p_f_given_e) {
+ if (scores[0] > threshold) {
+ p_f_given_e.insert(pair<float,string>(scores[0],e));
+ }
+ while(p_f_given_e.size() > maxE) p_f_given_e.erase(p_f_given_e.begin());
+ if (scores[2] > threshold) {
+ p_e_given_f.insert(pair<float,string>(scores[2],e));
+ }
+ while(p_e_given_f.size() > maxE) p_e_given_f.erase(p_e_given_f.begin());
+}
+
+static void finalise(Probs& p_e_given_f, Probs& p_f_given_e) {
+ //cerr << "Sizes: p(e|f): " << p_e_given_f.size() << " p(f|e): " << p_f_given_e.size() << endl;
+ for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
+ e1_iter != p_f_given_e.end(); ++e1_iter) {
+ for (Probs::const_iterator e2_iter = p_e_given_f.begin() ;
+ e2_iter != p_e_given_f.end(); ++e2_iter) {
+
+ if (e1_iter->second == e2_iter->second) continue;
+ cout << e1_iter->second << " ||| " << e2_iter->second << " ||| " <<
+ e1_iter->first * e2_iter->first << " ||| " << endl;
+ }
+ }
+ p_e_given_f.clear();
+ p_f_given_e.clear();
+}
+
+int main(int argc, char** argv) {
+
+ string input_file;
+
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("help,h", "Print help message and exit")
+ ("threshold,t", po::value<float>(&threshold), "Threshold for p(e|f) and p(f|e)")
+ ("max-target,m", po::value<size_t>(&maxE), "Maximum number of target phrases")
+ ("input-file", po::value<string>(&input_file)->required(), "Input phrase table")
+ ;
+
+ po::positional_options_description pos;
+ pos.add("input-file",1);
+
+ po::variables_map vm;
+ po::store(po::command_line_parser(argc,argv).options(desc).positional(pos).run(), vm);
+
+
+ if (vm.count("help")) {
+ cerr << "Usage: " << string(argv[0]) + " [options] input-file" << endl;
+ cerr << desc << endl;
+ return 0;
+ }
+
+ po::notify(vm);
+
+
+ cerr << "Reading from " << input_file << endl;
+ util::FilePiece in(input_file.c_str(), &std::cerr);
+ vector<float> scoreVector;
+ StringPiece line;
+ double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+
+
+ string previousSourcePhrase;
+ Probs p_f_given_e_table;
+ Probs p_e_given_f_table;
+
+ size_t count = 0;
+ while(true) {
+ try {
+ line = in.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+ ++count;
+
+ util::TokenIter<util::MultiCharacter> pipes(line, " ||| ");
+ StringPiece sourcePhrase(*pipes);
+ StringPiece targetPhrase(*++pipes);
+ StringPiece scoreString(*++pipes);
+ scoreVector.clear();
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+ int processed;
+ float score = converter.StringToFloat(s->data(), s->length(), &processed);
+ UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count);
+ scoreVector.push_back(score);
+ }
+
+ if (sourcePhrase.size() && sourcePhrase != previousSourcePhrase) {
+ finalise(p_e_given_f_table, p_f_given_e_table);
+ }
+ add(targetPhrase.as_string(),scoreVector, p_e_given_f_table, p_f_given_e_table);
+ previousSourcePhrase = sourcePhrase.as_string();
+ }
+ finalise(p_e_given_f_table, p_f_given_e_table);
+
+
+
+ return 0;
+}
diff --git a/contrib/rt/Empty.c b/contrib/rt/Empty.c
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/rt/Empty.c
diff --git a/contrib/rt/README b/contrib/rt/README
new file mode 100644
index 000000000..d7a4cfebc
--- /dev/null
+++ b/contrib/rt/README
@@ -0,0 +1,9 @@
+FOR OSX ONLY
+------------
+
+This creates an empty library file
+ librt.a
+It should be used when you compile with Eclipse on OSX.
+
+The Eclipse projects are set up to link to librt but OSX doesn't have it so this just creates a dummy library.
+
diff --git a/contrib/rt/compile.sh b/contrib/rt/compile.sh
new file mode 100755
index 000000000..6266d58d6
--- /dev/null
+++ b/contrib/rt/compile.sh
@@ -0,0 +1,2 @@
+gcc -c Empty.c -o Empty.o
+ar rcs librt.a Empty.o \ No newline at end of file
diff --git a/contrib/server/Jamfile b/contrib/server/Jamfile
index 49770d548..a18a31cb0 100644
--- a/contrib/server/Jamfile
+++ b/contrib/server/Jamfile
@@ -5,6 +5,10 @@ import path ;
with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ;
if $(with-xmlrpc-c) {
+ echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
+ echo "!!! You are linking the XMLRPC-C library; Do NOT use v.1.25.29 !!!" ;
+ echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
+
build-moses-server = true ;
xmlrpc-command = $(with-xmlrpc-c)/bin/xmlrpc-c-config ;
if ! [ path.exists $(xmlrpc-command) ] {
@@ -35,7 +39,7 @@ if $(build-moses-server) = true
xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --libs" ] ;
xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --cflags" ] ;
- exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt ../../moses-cmd/IOWrapper.cpp : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
+ exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt ../..//boost_filesystem : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
} else {
alias mosesserver ;
}
diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index 0a7cd78df..a2e7b9183 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -4,10 +4,12 @@
#include <algorithm>
+#include "moses/Util.h"
#include "moses/ChartManager.h"
#include "moses/Hypothesis.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
+#include "moses/ThreadPool.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#if PT_UG
@@ -15,7 +17,7 @@
#endif
#include "moses/TreeInput.h"
#include "moses/LM/ORLM.h"
-#include "moses-cmd/IOWrapper.h"
+#include "moses/IOWrapper.h"
#ifdef WITH_THREADS
#include <boost/thread.hpp>
@@ -26,7 +28,6 @@
#include <xmlrpc-c/server_abyss.hpp>
using namespace Moses;
-using namespace MosesCmd;
using namespace std;
typedef std::map<std::string, xmlrpc_c::value> params_t;
@@ -52,14 +53,14 @@ public:
#else
const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0];
PhraseDictionaryDynSuffixArray*
- pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
+ pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
cerr << "Inserting into address " << pdsa << endl;
pdsa->insertSnt(source_, target_, alignment_);
#endif
if(add2ORLM_) {
//updateORLM();
}
- cerr << "Done inserting\n";
+ XVERBOSE(1,"Done inserting\n");
//PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
map<string, xmlrpc_c::value> retData;
//*retvalP = xmlrpc_c::value_struct(retData);
@@ -72,65 +73,65 @@ public:
string source_, target_, alignment_;
bool bounded_, add2ORLM_;
/*
- void updateORLM() {
- // TODO(level101): this belongs in the language model, not in moseserver.cpp
- vector<string> vl;
- map<vector<string>, int> ngSet;
- LMList lms = StaticData::Instance().GetLMList(); // get LM
- LMList::const_iterator lmIter = lms.begin();
- LanguageModel *lm = *lmIter;
- LanguageModelORLM* orlm = static_cast<LanguageModelORLM*>(lm);
- if(orlm == 0) {
- cerr << "WARNING: Unable to add target sentence to ORLM\n";
- return;
- }
- // break out new ngrams from sentence
- const int ngOrder(orlm->GetNGramOrder());
- const std::string sBOS = orlm->GetSentenceStart()->GetString().as_string();
- const std::string sEOS = orlm->GetSentenceEnd()->GetString().as_string();
- Utils::splitToStr(target_, vl, " ");
- // insert BOS and EOS
- vl.insert(vl.begin(), sBOS);
- vl.insert(vl.end(), sEOS);
- for(int j=0; j < vl.size(); ++j) {
- int i = (j<ngOrder) ? 0 : j-ngOrder+1;
- for(int t=j; t >= i; --t) {
- vector<string> ngVec;
- for(int s=t; s<=j; ++s) {
- ngVec.push_back(vl[s]);
- //cerr << vl[s] << " ";
- }
- ngSet[ngVec]++;
- //cerr << endl;
- }
- }
- // insert into LM in order from 1grams up (for LM well-formedness)
- cerr << "Inserting " << ngSet.size() << " ngrams into ORLM...\n";
- for(int i=1; i <= ngOrder; ++i) {
- iterate(ngSet, it) {
- if(it->first.size() == i)
- orlm->UpdateORLM(it->first, it->second);
- }
- }
- }
- */
+ void updateORLM() {
+ // TODO(level101): this belongs in the language model, not in moseserver.cpp
+ vector<string> vl;
+ map<vector<string>, int> ngSet;
+ LMList lms = StaticData::Instance().GetLMList(); // get LM
+ LMList::const_iterator lmIter = lms.begin();
+ LanguageModel *lm = *lmIter;
+ LanguageModelORLM* orlm = static_cast<LanguageModelORLM*>(lm);
+ if(orlm == 0) {
+ cerr << "WARNING: Unable to add target sentence to ORLM\n";
+ return;
+ }
+ // break out new ngrams from sentence
+ const int ngOrder(orlm->GetNGramOrder());
+ const std::string sBOS = orlm->GetSentenceStart()->GetString().as_string();
+ const std::string sEOS = orlm->GetSentenceEnd()->GetString().as_string();
+ Utils::splitToStr(target_, vl, " ");
+ // insert BOS and EOS
+ vl.insert(vl.begin(), sBOS);
+ vl.insert(vl.end(), sEOS);
+ for(int j=0; j < vl.size(); ++j) {
+ int i = (j<ngOrder) ? 0 : j-ngOrder+1;
+ for(int t=j; t >= i; --t) {
+ vector<string> ngVec;
+ for(int s=t; s<=j; ++s) {
+ ngVec.push_back(vl[s]);
+ //cerr << vl[s] << " ";
+ }
+ ngSet[ngVec]++;
+ //cerr << endl;
+ }
+ }
+ // insert into LM in order from 1grams up (for LM well-formedness)
+ cerr << "Inserting " << ngSet.size() << " ngrams into ORLM...\n";
+ for(int i=1; i <= ngOrder; ++i) {
+ iterate(ngSet, it) {
+ if(it->first.size() == i)
+ orlm->UpdateORLM(it->first, it->second);
+ }
+ }
+ }
+ */
void breakOutParams(const params_t& params) {
params_t::const_iterator si = params.find("source");
if(si == params.end())
throw xmlrpc_c::fault("Missing source sentence", xmlrpc_c::fault::CODE_PARSE);
source_ = xmlrpc_c::value_string(si->second);
- cerr << "source = " << source_ << endl;
+ XVERBOSE(1,"source = " << source_ << endl);
si = params.find("target");
if(si == params.end())
throw xmlrpc_c::fault("Missing target sentence", xmlrpc_c::fault::CODE_PARSE);
target_ = xmlrpc_c::value_string(si->second);
- cerr << "target = " << target_ << endl;
+ XVERBOSE(1,"target = " << target_ << endl);
si = params.find("alignment");
if(si == params.end())
throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE);
alignment_ = xmlrpc_c::value_string(si->second);
- cerr << "alignment = " << alignment_ << endl;
+ XVERBOSE(1,"alignment = " << alignment_ << endl);
si = params.find("bounded");
bounded_ = (si != params.end());
si = params.find("updateORLM");
@@ -148,7 +149,7 @@ public:
this->_signature = "S:S";
this->_help = "Optimizes multi-model translation model";
}
-
+
void
execute(xmlrpc_c::paramList const& paramList,
xmlrpc_c::value * const retvalP) {
@@ -157,37 +158,37 @@ public:
params_t::const_iterator si = params.find("model_name");
if (si == params.end()) {
throw xmlrpc_c::fault(
- "Missing name of model to be optimized (e.g. PhraseDictionaryMultiModelCounts0)",
- xmlrpc_c::fault::CODE_PARSE);
+ "Missing name of model to be optimized (e.g. PhraseDictionaryMultiModelCounts0)",
+ xmlrpc_c::fault::CODE_PARSE);
}
const string model_name = xmlrpc_c::value_string(si->second);
PhraseDictionaryMultiModel* pdmm = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
-
+
si = params.find("phrase_pairs");
if (si == params.end()) {
throw xmlrpc_c::fault(
- "Missing list of phrase pairs",
- xmlrpc_c::fault::CODE_PARSE);
+ "Missing list of phrase pairs",
+ xmlrpc_c::fault::CODE_PARSE);
}
-
+
vector<pair<string, string> > phrase_pairs;
-
+
xmlrpc_c::value_array phrase_pairs_array = xmlrpc_c::value_array(si->second);
vector<xmlrpc_c::value> phrasePairValueVector(phrase_pairs_array.vectorValueValue());
for (size_t i=0;i < phrasePairValueVector.size();i++) {
- xmlrpc_c::value_array phrasePairArray = xmlrpc_c::value_array(phrasePairValueVector[i]);
- vector<xmlrpc_c::value> phrasePair(phrasePairArray.vectorValueValue());
- string L1 = xmlrpc_c::value_string(phrasePair[0]);
- string L2 = xmlrpc_c::value_string(phrasePair[1]);
- phrase_pairs.push_back(make_pair(L1,L2));
+ xmlrpc_c::value_array phrasePairArray = xmlrpc_c::value_array(phrasePairValueVector[i]);
+ vector<xmlrpc_c::value> phrasePair(phrasePairArray.vectorValueValue());
+ string L1 = xmlrpc_c::value_string(phrasePair[0]);
+ string L2 = xmlrpc_c::value_string(phrasePair[1]);
+ phrase_pairs.push_back(make_pair(L1,L2));
}
-
+
vector<float> weight_vector;
weight_vector = pdmm->MinimizePerplexity(phrase_pairs);
-
+
vector<xmlrpc_c::value> weight_vector_ret;
for (size_t i=0;i < weight_vector.size();i++) {
- weight_vector_ret.push_back(xmlrpc_c::value_double(weight_vector[i]));
+ weight_vector_ret.push_back(xmlrpc_c::value_double(weight_vector[i]));
}
*retvalP = xmlrpc_c::value_array(weight_vector_ret);
#else
@@ -198,33 +199,38 @@ public:
}
};
-
-class Translator : public xmlrpc_c::method
-{
+/**
+ * Required so that translations can be sent to a thread pool.
+**/
+class TranslationTask : public virtual Moses::Task {
public:
- Translator() {
- // signature and help strings are documentation -- the client
- // can query this information with a system.methodSignature and
- // system.methodHelp RPC.
- this->_signature = "S:S";
- this->_help = "Does translation";
- }
-
- void
- execute(xmlrpc_c::paramList const& paramList,
- xmlrpc_c::value * const retvalP) {
-
- const params_t params = paramList.getStruct(0);
- paramList.verifyEnd(1);
+ TranslationTask(xmlrpc_c::paramList const& paramList,
+ boost::condition_variable& cond, boost::mutex& mut)
+ : m_paramList(paramList),
+ m_cond(cond),
+ m_mut(mut),
+ m_done(false)
+ {}
+
+ virtual bool DeleteAfterExecution() {return false;}
+
+ bool IsDone() const {return m_done;}
+
+ const map<string, xmlrpc_c::value>& GetRetData() { return m_retData;}
+
+ virtual void Run() {
+
+ const params_t params = m_paramList.getStruct(0);
+ m_paramList.verifyEnd(1);
params_t::const_iterator si = params.find("text");
if (si == params.end()) {
throw xmlrpc_c::fault(
- "Missing source text",
- xmlrpc_c::fault::CODE_PARSE);
+ "Missing source text",
+ xmlrpc_c::fault::CODE_PARSE);
}
const string source((xmlrpc_c::value_string(si->second)));
-
- cerr << "Input: " << source << endl;
+
+ XVERBOSE(1,"Input: " << source << endl);
si = params.find("align");
bool addAlignInfo = (si != params.end());
si = params.find("word-align");
@@ -233,105 +239,80 @@ public:
bool addGraphInfo = (si != params.end());
si = params.find("topt");
bool addTopts = (si != params.end());
- si = params.find("report-segmentation");
- bool reportSegmentation = (si != params.end());
si = params.find("report-all-factors");
bool reportAllFactors = (si != params.end());
si = params.find("nbest");
int nbest_size = (si == params.end()) ? 0 : int(xmlrpc_c::value_int(si->second));
si = params.find("nbest-distinct");
bool nbest_distinct = (si != params.end());
+
si = params.find("add-score-breakdown");
bool addScoreBreakdown = (si != params.end());
-
+
vector<float> multiModelWeights;
si = params.find("lambda");
if (si != params.end()) {
- xmlrpc_c::value_array multiModelArray = xmlrpc_c::value_array(si->second);
- vector<xmlrpc_c::value> multiModelValueVector(multiModelArray.vectorValueValue());
- for (size_t i=0;i < multiModelValueVector.size();i++) {
- multiModelWeights.push_back(xmlrpc_c::value_double(multiModelValueVector[i]));
- }
+ xmlrpc_c::value_array multiModelArray = xmlrpc_c::value_array(si->second);
+ vector<xmlrpc_c::value> multiModelValueVector(multiModelArray.vectorValueValue());
+ for (size_t i=0;i < multiModelValueVector.size();i++) {
+ multiModelWeights.push_back(xmlrpc_c::value_double(multiModelValueVector[i]));
+ }
}
-
+
si = params.find("model_name");
if (si != params.end() && multiModelWeights.size() > 0) {
- const string model_name = xmlrpc_c::value_string(si->second);
- PhraseDictionaryMultiModel* pdmm = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
- pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights);
+ const string model_name = xmlrpc_c::value_string(si->second);
+ PhraseDictionaryMultiModel* pdmm = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
+ pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights);
}
-
+
const StaticData &staticData = StaticData::Instance();
-
- if (addGraphInfo) {
+
+ //Make sure alternative paths are retained, if necessary
+ if (addGraphInfo || nbest_size>0) {
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(true);
}
-
+
+
stringstream out, graphInfo, transCollOpts;
- map<string, xmlrpc_c::value> retData;
-
+
if (staticData.IsChart()) {
- TreeInput tinput;
- const vector<FactorType>& inputFactorOrder = staticData.GetInputFactorOrder();
- stringstream in(source + "\n");
- tinput.Read(in,inputFactorOrder);
- ChartManager manager(tinput);
- manager.ProcessSentence();
-
- const ChartHypothesis* bestHypo = NULL;
- bestHypo = manager.GetBestHypothesis();
-
- if (bestHypo) {
- outputChartHypo(out,bestHypo);
-
- const size_t translationId = tinput.GetTranslationId();
- if (staticData.IsIDEnabled()) {
- retData.insert(pair<string, xmlrpc_c::value_int>("id", translationId));
- }
-
- if (addGraphInfo) {
- std::ostringstream sgstream;
- manager.GetSearchGraph(translationId,sgstream);
- retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_string(sgstream.str())));
- }
- }
- else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- }
- } else {
- Sentence sentence;
- const vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
- stringstream in(source + "\n");
- sentence.Read(in,inputFactorOrder);
- const size_t translationId = sentence.GetTranslationId();
- size_t lineNumber = 0; // TODO: Include sentence request number here?
- Manager manager(lineNumber, sentence, staticData.GetSearchAlgorithm());
- manager.ProcessSentence();
- const Hypothesis* bestHypo = NULL;
- bestHypo = manager.GetBestHypothesis();
- if (bestHypo) {
- VERBOSE(0, "HERE I AM:" << staticData.GetOutputHypoScore() << endl);
- if (staticData.GetOutputHypoScore()) {
- retData.insert(pair<string, xmlrpc_c::value_double>("totalScore", bestHypo->GetTotalScore()));
- VERBOSE(0, "TOTALSCORE:" << bestHypo->GetTotalScore() << endl);
- }
-
- if (staticData.IsIDEnabled()) {
- retData.insert(pair<string, xmlrpc_c::value_int>("id", translationId));
+ TreeInput tinput;
+ const vector<FactorType>&
+ inputFactorOrder = staticData.GetInputFactorOrder();
+ stringstream in(source + "\n");
+ tinput.Read(in,inputFactorOrder);
+ ChartManager manager(tinput);
+ manager.Decode();
+ const ChartHypothesis *hypo = manager.GetBestHypothesis();
+ outputChartHypo(out,hypo);
+ if (addGraphInfo) {
+ // const size_t translationId = tinput.GetTranslationId();
+ std::ostringstream sgstream;
+ manager.OutputSearchGraphMoses(sgstream);
+ m_retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_string(sgstream.str())));
}
-
- if (staticData.IsPassthroughEnabled()) {
- OutputPassthroughInformation(out, bestHypo);
- }
-
+ } else {
+ size_t lineNumber = 0; // TODO: Include sentence request number here?
+ Sentence sentence;
+ sentence.SetTranslationId(lineNumber);
+
+ const vector<FactorType> &
+ inputFactorOrder = staticData.GetInputFactorOrder();
+ stringstream in(source + "\n");
+ sentence.Read(in,inputFactorOrder);
+ Manager manager(sentence, staticData.GetSearchAlgorithm());
+ manager.Decode();
+ const Hypothesis* hypo = manager.GetBestHypothesis();
+
vector<xmlrpc_c::value> alignInfo;
- outputHypo(out,bestHypo,addAlignInfo,alignInfo,reportSegmentation,reportAllFactors);
+ outputHypo(out,hypo,addAlignInfo,alignInfo,reportAllFactors);
if (addAlignInfo) {
- retData.insert(pair<string, xmlrpc_c::value>("align", xmlrpc_c::value_array(alignInfo)));
+ m_retData.insert(pair<string, xmlrpc_c::value>("align", xmlrpc_c::value_array(alignInfo)));
}
if (addWordAlignInfo) {
stringstream wordAlignment;
- OutputAlignment(wordAlignment, bestHypo);
+ IOWrapper::OutputAlignment(wordAlignment, hypo);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
@@ -341,383 +322,399 @@ public:
wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
}
- retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
+ m_retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
}
-
+
if (addGraphInfo) {
- insertGraphInfo(manager,retData);
- (const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
+ insertGraphInfo(manager,m_retData);
}
if (addTopts) {
- insertTranslationOptions(manager,retData);
+ insertTranslationOptions(manager,m_retData);
}
if (nbest_size>0) {
- VERBOSE(0, "NBEST:" << nbest_size << endl);
- outputNBest(manager, retData, nbest_size, nbest_distinct,
- reportSegmentation,reportAllFactors, addAlignInfo, addScoreBreakdown);
+ outputNBest(manager, m_retData, nbest_size, nbest_distinct,
+ reportAllFactors, addAlignInfo, addScoreBreakdown);
}
-
- pair<string, xmlrpc_c::value>
- text("text", xmlrpc_c::value_string(out.str()));
- retData.insert(text);
- cerr << "Output: " << out.str() << endl;
- *retvalP = xmlrpc_c::value_struct(retData);
- }
- else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- }
- }
- }
-
- void outputHypo(ostream& out, const Hypothesis* hypo, bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo, bool reportSegmentation = false, bool reportAllFactors = false) {
- if (hypo->GetPrevHypo() != NULL) {
- outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo, alignInfo, reportSegmentation, reportAllFactors);
- Phrase p = hypo->GetCurrTargetPhrase();
- if(reportAllFactors) {
- out << p << " ";
- } else {
- for (size_t pos = 0 ; pos < p.GetSize() ; pos++) {
- const Factor *factor = p.GetFactor(pos, 0);
- out << *factor << " ";
- }
- }
- //phrase-to-phrase segmentation
- // trace ("report segmentation") option "-t"
- // enriched segmentation ("-tt") is not reported
- if (reportSegmentation > 0 && p.GetSize() > 0) {
- const WordsRange &sourceRange = hypo->GetCurrSourceWordsRange();
- const int sourceStart = sourceRange.GetStartPos();
- const int sourceEnd = sourceRange.GetEndPos();
- out << "|" << sourceStart << "-" << sourceEnd << "| ";
- }
-
- if (addAlignmentInfo) {
- /**
- * Add the alignment info to the array. This is in target
- * order and consists of (tgt-start, src-start, src-end)
- * triples.
- **/
- map<string, xmlrpc_c::value> phraseAlignInfo;
- phraseAlignInfo["tgt-start"] = xmlrpc_c::value_int(hypo->GetCurrTargetWordsRange().GetStartPos());
- phraseAlignInfo["src-start"] = xmlrpc_c::value_int(hypo->GetCurrSourceWordsRange().GetStartPos());
- phraseAlignInfo["src-end"] = xmlrpc_c::value_int(hypo->GetCurrSourceWordsRange().GetEndPos());
- alignInfo.push_back(xmlrpc_c::value_struct(phraseAlignInfo));
- }
- }
- }
-
- void outputChartHypo(ostream& out, const ChartHypothesis* hypo) {
- Phrase outPhrase(20);
- hypo->GetOutputPhrase(outPhrase);
-
- // delete 1st & last
- assert(outPhrase.GetSize() >= 2);
- outPhrase.RemoveWord(0);
- outPhrase.RemoveWord(outPhrase.GetSize() - 1);
- for (size_t pos = 0 ; pos < outPhrase.GetSize() ; pos++) {
- const Factor *factor = outPhrase.GetFactor(pos, 0);
- out << *factor << " ";
- }
-
- }
-
- bool compareSearchGraphNode(const SearchGraphNode& a, const SearchGraphNode b) {
- return a.hypo->GetId() < b.hypo->GetId();
- }
-
- void insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData) {
- vector<xmlrpc_c::value> searchGraphXml;
- vector<SearchGraphNode> searchGraph;
- manager.GetSearchGraph(searchGraph);
- std::sort(searchGraph.begin(), searchGraph.end());
- for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin(); i != searchGraph.end(); ++i) {
- map<string, xmlrpc_c::value> searchGraphXmlNode;
- searchGraphXmlNode["forward"] = xmlrpc_c::value_double(i->forward);
- searchGraphXmlNode["fscore"] = xmlrpc_c::value_double(i->fscore);
- const Hypothesis* hypo = i->hypo;
- searchGraphXmlNode["hyp"] = xmlrpc_c::value_int(hypo->GetId());
- searchGraphXmlNode["stack"] = xmlrpc_c::value_int(hypo->GetWordsBitmap().GetNumWordsCovered());
- if (hypo->GetId() != 0) {
- const Hypothesis *prevHypo = hypo->GetPrevHypo();
- searchGraphXmlNode["back"] = xmlrpc_c::value_int(prevHypo->GetId());
- searchGraphXmlNode["score"] = xmlrpc_c::value_double(hypo->GetScore());
- searchGraphXmlNode["transition"] = xmlrpc_c::value_double(hypo->GetScore() - prevHypo->GetScore());
- if (i->recombinationHypo) {
- searchGraphXmlNode["recombined"] = xmlrpc_c::value_int(i->recombinationHypo->GetId());
- }
- searchGraphXmlNode["cover-start"] = xmlrpc_c::value_int(hypo->GetCurrSourceWordsRange().GetStartPos());
- searchGraphXmlNode["cover-end"] = xmlrpc_c::value_int(hypo->GetCurrSourceWordsRange().GetEndPos());
- searchGraphXmlNode["out"] =
- xmlrpc_c::value_string(hypo->GetCurrTargetPhrase().GetStringRep(StaticData::Instance().GetOutputFactorOrder()));
- }
- searchGraphXml.push_back(xmlrpc_c::value_struct(searchGraphXmlNode));
- }
- retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_array(searchGraphXml)));
- }
-
- void outputNBest(const Manager& manager,
- map<string, xmlrpc_c::value>& retData,
- const int n=100,
- const bool distinct=false,
- const bool reportSegmentation=false,
- const bool reportAllFactors=false,
- const bool addAlignmentInfo=false,
- const bool addScoreBreakdown=false)
+ (const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
+
+ }
+ pair<string, xmlrpc_c::value>
+ text("text", xmlrpc_c::value_string(out.str()));
+ m_retData.insert(text);
+ XVERBOSE(1,"Output: " << out.str() << endl);
+ {
+ boost::lock_guard<boost::mutex> lock(m_mut);
+ m_done = true;
+ }
+ m_cond.notify_one();
+
+ }
+
+ void outputHypo(ostream& out, const Hypothesis* hypo, bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo, bool reportAllFactors = false) {
+ if (hypo->GetPrevHypo() != NULL) {
+ outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo, alignInfo, reportAllFactors);
+ Phrase p = hypo->GetCurrTargetPhrase();
+ if(reportAllFactors) {
+ out << p << " ";
+ } else {
+ for (size_t pos = 0 ; pos < p.GetSize() ; pos++) {
+ const Factor *factor = p.GetFactor(pos, 0);
+ out << *factor << " ";
+ }
+ }
+
+ if (addAlignmentInfo) {
+ /**
+ * Add the alignment info to the array. This is in target
+ * order and consists of (tgt-start, src-start, src-end)
+ * triples.
+ **/
+ map<string, xmlrpc_c::value> phraseAlignInfo;
+ phraseAlignInfo["tgt-start"] = xmlrpc_c::value_int(hypo->GetCurrTargetWordsRange().GetStartPos());
+ phraseAlignInfo["src-start"] = xmlrpc_c::value_int(hypo->GetCurrSourceWordsRange().GetStartPos());
+ phraseAlignInfo["src-end"] = xmlrpc_c::value_int(hypo->GetCurrSourceWordsRange().GetEndPos());
+ alignInfo.push_back(xmlrpc_c::value_struct(phraseAlignInfo));
+ }
+ }
+ }
+
+ void outputChartHypo(ostream& out, const ChartHypothesis* hypo) {
+ Phrase outPhrase(20);
+ hypo->GetOutputPhrase(outPhrase);
+
+ // delete 1st & last
+ assert(outPhrase.GetSize() >= 2);
+ outPhrase.RemoveWord(0);
+ outPhrase.RemoveWord(outPhrase.GetSize() - 1);
+ for (size_t pos = 0 ; pos < outPhrase.GetSize() ; pos++) {
+ const Factor *factor = outPhrase.GetFactor(pos, 0);
+ out << *factor << " ";
+ }
+
+ }
+
+ bool compareSearchGraphNode(const SearchGraphNode& a, const SearchGraphNode b) {
+ return a.hypo->GetId() < b.hypo->GetId();
+ }
+
+ void insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData) {
+ vector<xmlrpc_c::value> searchGraphXml;
+ vector<SearchGraphNode> searchGraph;
+ manager.GetSearchGraph(searchGraph);
+ std::sort(searchGraph.begin(), searchGraph.end());
+ for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin(); i != searchGraph.end(); ++i) {
+ map<string, xmlrpc_c::value> searchGraphXmlNode;
+ searchGraphXmlNode["forward"] = xmlrpc_c::value_double(i->forward);
+ searchGraphXmlNode["fscore"] = xmlrpc_c::value_double(i->fscore);
+ const Hypothesis* hypo = i->hypo;
+ searchGraphXmlNode["hyp"] = xmlrpc_c::value_int(hypo->GetId());
+ searchGraphXmlNode["stack"] = xmlrpc_c::value_int(hypo->GetWordsBitmap().GetNumWordsCovered());
+ if (hypo->GetId() != 0) {
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
+ searchGraphXmlNode["back"] = xmlrpc_c::value_int(prevHypo->GetId());
+ searchGraphXmlNode["score"] = xmlrpc_c::value_double(hypo->GetScore());
+ searchGraphXmlNode["transition"] = xmlrpc_c::value_double(hypo->GetScore() - prevHypo->GetScore());
+ if (i->recombinationHypo) {
+ searchGraphXmlNode["recombined"] = xmlrpc_c::value_int(i->recombinationHypo->GetId());
+ }
+ searchGraphXmlNode["cover-start"] = xmlrpc_c::value_int(hypo->GetCurrSourceWordsRange().GetStartPos());
+ searchGraphXmlNode["cover-end"] = xmlrpc_c::value_int(hypo->GetCurrSourceWordsRange().GetEndPos());
+ searchGraphXmlNode["out"] =
+ xmlrpc_c::value_string(hypo->GetCurrTargetPhrase().GetStringRep(StaticData::Instance().GetOutputFactorOrder()));
+ }
+ searchGraphXml.push_back(xmlrpc_c::value_struct(searchGraphXmlNode));
+ }
+ retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_array(searchGraphXml)));
+ }
+
+ void outputNBest(const Manager& manager,
+ map<string, xmlrpc_c::value>& retData,
+ const int n=100,
+ const bool distinct=false,
+ const bool reportAllFactors=false,
+ const bool addAlignmentInfo=false,
+ const bool addScoreBreakdown=false)
+ {
+ TrellisPathList nBestList;
+ manager.CalcNBest(n, nBestList, distinct);
+
+ vector<xmlrpc_c::value> nBestXml;
+ TrellisPathList::const_iterator iter;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+ const TrellisPath &path = **iter;
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
+ map<string, xmlrpc_c::value> nBestXMLItem;
+
+ // output surface
+ ostringstream out;
+ vector<xmlrpc_c::value> alignInfo;
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ const Phrase& phrase = edge.GetCurrTargetPhrase();
+ if(reportAllFactors) {
+ out << phrase << " ";
+ } else {
+ for (size_t pos = 0 ; pos < phrase.GetSize() ; pos++) {
+ const Factor *factor = phrase.GetFactor(pos, 0);
+ out << *factor << " ";
+ }
+ }
+
+ if (addAlignmentInfo && currEdge != (int)edges.size() - 1) {
+ map<string, xmlrpc_c::value> phraseAlignInfo;
+ phraseAlignInfo["tgt-start"] = xmlrpc_c::value_int(edge.GetCurrTargetWordsRange().GetStartPos());
+ phraseAlignInfo["src-start"] = xmlrpc_c::value_int(edge.GetCurrSourceWordsRange().GetStartPos());
+ phraseAlignInfo["src-end"] = xmlrpc_c::value_int(edge.GetCurrSourceWordsRange().GetEndPos());
+ alignInfo.push_back(xmlrpc_c::value_struct(phraseAlignInfo));
+ }
+ }
+ nBestXMLItem["hyp"] = xmlrpc_c::value_string(out.str());
+
+ if (addAlignmentInfo) {
+ nBestXMLItem["align"] = xmlrpc_c::value_array(alignInfo);
+
+ if ((int)edges.size() > 0) {
+ stringstream wordAlignment;
+ IOWrapper::OutputAlignment(wordAlignment, edges[0]);
+ vector<xmlrpc_c::value> alignments;
+ string alignmentPair;
+ while (wordAlignment >> alignmentPair) {
+ int pos = alignmentPair.find('-');
+ map<string, xmlrpc_c::value> wordAlignInfo;
+ wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
+ wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
+ alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
+ }
+ nBestXMLItem["word-align"] = xmlrpc_c::value_array(alignments);
+ }
+ }
+
+ if (addScoreBreakdown)
{
- TrellisPathList nBestList;
- manager.CalcNBest(n, nBestList, distinct);
-
- VERBOSE(0, "NBEST:" << endl);
- VERBOSE(0, "DISTINCT:" << distinct << endl);
-
- vector<xmlrpc_c::value> nBestXml;
- TrellisPathList::const_iterator iter;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- VERBOSE(0, "nbest " << endl);
- const TrellisPath &path = **iter;
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
- map<string, xmlrpc_c::value> nBestXMLItem;
-
- // output surface
- ostringstream out;
- vector<xmlrpc_c::value> alignInfo;
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const Phrase& phrase = edge.GetCurrTargetPhrase();
- if(reportAllFactors) {
- out << phrase << " ";
- } else {
- for (size_t pos = 0 ; pos < phrase.GetSize() ; pos++) {
- const Factor *factor = phrase.GetFactor(pos, 0);
- out << *factor << " ";
- }
- }
-
- //phrase-to-phrase segmentation
- // trace ("report segmentation") option "-t"
- // enriched segmentation ("-tt") is not reported
- if (reportSegmentation > 0 && phrase.GetSize() > 0) {
- const WordsRange &sourceRange = edge.
-
- GetCurrSourceWordsRange();
- const int sourceStart = sourceRange.GetStartPos();
- const int sourceEnd = sourceRange.GetEndPos();
- out << "|" << sourceStart << "-" << sourceEnd << "| ";
- }
-
-
- if (addAlignmentInfo && currEdge != (int)edges.size() - 1) {
- map<string, xmlrpc_c::value> phraseAlignInfo;
- phraseAlignInfo["tgt-start"] = xmlrpc_c::value_int(edge.GetCurrTargetWordsRange().GetStartPos());
- phraseAlignInfo["src-start"] = xmlrpc_c::value_int(edge.GetCurrSourceWordsRange().GetStartPos());
- phraseAlignInfo["src-end"] = xmlrpc_c::value_int(edge.GetCurrSourceWordsRange().GetEndPos());
- alignInfo.push_back(xmlrpc_c::value_struct(phraseAlignInfo));
- }
- }
- nBestXMLItem["hyp"] = xmlrpc_c::value_string(out.str());
-
- if (addAlignmentInfo) {
- nBestXMLItem["align"] = xmlrpc_c::value_array(alignInfo);
-
- if ((int)edges.size() > 0) {
- stringstream wordAlignment;
- OutputAlignment(wordAlignment, edges[0]);
- vector<xmlrpc_c::value> alignments;
- string alignmentPair;
- while (wordAlignment >> alignmentPair) {
- int pos = alignmentPair.find('-');
- map<string, xmlrpc_c::value> wordAlignInfo;
- wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
- wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
- alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
- }
- nBestXMLItem["word-align"] = xmlrpc_c::value_array(alignments);
- }
- }
-
- if (addScoreBreakdown)
- {
- // should the score breakdown be reported in a more structured manner?
- ostringstream buf;
- MosesCmd::OutputAllFeatureScores(path.GetScoreBreakdown(),buf);
- nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
- }
-
- // weighted score
- nBestXMLItem["totalScore"] = xmlrpc_c::value_double(path.GetTotalScore());
- nBestXml.push_back(xmlrpc_c::value_struct(nBestXMLItem));
- }
- retData.insert(pair<string, xmlrpc_c::value>("nbest", xmlrpc_c::value_array(nBestXml)));
- }
-
- void insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData) {
- const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
- vector<xmlrpc_c::value> toptsXml;
- for (size_t startPos = 0 ; startPos < toptsColl->GetSource().GetSize() ; ++startPos) {
- size_t maxSize = toptsColl->GetSource().GetSize() - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- WordsRange range(startPos,endPos);
- const TranslationOptionList& fullList = toptsColl->GetTranslationOptionList(range);
- for (size_t i = 0; i < fullList.size(); i++) {
- const TranslationOption* topt = fullList.Get(i);
- map<string, xmlrpc_c::value> toptXml;
- toptXml["phrase"] = xmlrpc_c::value_string(topt->GetTargetPhrase().
- GetStringRep(StaticData::Instance().GetOutputFactorOrder()));
- toptXml["fscore"] = xmlrpc_c::value_double(topt->GetFutureScore());
- toptXml["start"] = xmlrpc_c::value_int(startPos);
- toptXml["end"] = xmlrpc_c::value_int(endPos);
- vector<xmlrpc_c::value> scoresXml;
- const std::valarray<FValue> &scores = topt->GetScoreBreakdown().getCoreFeatures();
- for (size_t j = 0; j < scores.size(); ++j) {
- scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
- }
- toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
- toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
- }
- }
- }
- retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
+ // should the score breakdown be reported in a more structured manner?
+ ostringstream buf;
+ IOWrapper::OutputAllFeatureScores(path.GetScoreBreakdown(),buf);
+ nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
+
+ // weighted score
+ nBestXMLItem["totalScore"] = xmlrpc_c::value_double(path.GetTotalScore());
+ nBestXml.push_back(xmlrpc_c::value_struct(nBestXMLItem));
+ }
+ retData.insert(pair<string, xmlrpc_c::value>("nbest", xmlrpc_c::value_array(nBestXml)));
+ }
+
+ void insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData) {
+ const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
+ vector<xmlrpc_c::value> toptsXml;
+ for (size_t startPos = 0 ; startPos < toptsColl->GetSource().GetSize() ; ++startPos) {
+ size_t maxSize = toptsColl->GetSource().GetSize() - startPos;
+ size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
+ maxSize = std::min(maxSize, maxSizePhrase);
+
+ for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
+ WordsRange range(startPos,endPos);
+ const TranslationOptionList& fullList = toptsColl->GetTranslationOptionList(range);
+ for (size_t i = 0; i < fullList.size(); i++) {
+ const TranslationOption* topt = fullList.Get(i);
+ map<string, xmlrpc_c::value> toptXml;
+ toptXml["phrase"] = xmlrpc_c::value_string(topt->GetTargetPhrase().
+ GetStringRep(StaticData::Instance().GetOutputFactorOrder()));
+ toptXml["fscore"] = xmlrpc_c::value_double(topt->GetFutureScore());
+ toptXml["start"] = xmlrpc_c::value_int(startPos);
+ toptXml["end"] = xmlrpc_c::value_int(endPos);
+ vector<xmlrpc_c::value> scoresXml;
+ const std::valarray<FValue> &scores = topt->GetScoreBreakdown().getCoreFeatures();
+ for (size_t j = 0; j < scores.size(); ++j) {
+ scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
+ }
+ toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
+ toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
+ }
+ }
+ }
+ retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
+
+ }
+
+private:
+ xmlrpc_c::paramList const& m_paramList;
+ map<string, xmlrpc_c::value> m_retData;
+ boost::condition_variable& m_cond;
+ boost::mutex& m_mut;
+ bool m_done;
+};
+
+class Translator : public xmlrpc_c::method
+{
+public:
+ Translator(size_t numThreads = 10) : m_threadPool(numThreads) {
+ // signature and help strings are documentation -- the client
+ // can query this information with a system.methodSignature and
+ // system.methodHelp RPC.
+ this->_signature = "S:S";
+ this->_help = "Does translation";
+ }
+
+ void
+ execute(xmlrpc_c::paramList const& paramList,
+ xmlrpc_c::value * const retvalP) {
+ boost::condition_variable cond;
+ boost::mutex mut;
+ TranslationTask task(paramList,cond,mut);
+ m_threadPool.Submit(&task);
+ boost::unique_lock<boost::mutex> lock(mut);
+ while (!task.IsDone()) {
+ cond.wait(lock);
+ }
+ *retvalP = xmlrpc_c::value_struct(task.GetRetData());
+ }
+private:
+ Moses::ThreadPool m_threadPool;
};
static
void
PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
{
- out << ff->GetScoreProducerDescription() << "=";
- size_t numScoreComps = ff->GetNumScoreComponents();
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- for (size_t i = 0; i < numScoreComps; ++i) {
- out << " " << values[i];
- }
- out << endl;
+ out << ff->GetScoreProducerDescription() << "=";
+ size_t numScoreComps = ff->GetNumScoreComponents();
+ vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+ for (size_t i = 0; i < numScoreComps; ++i) {
+ out << " " << values[i];
+ }
+ out << endl;
}
static
void
ShowWeights(ostream& out)
{
- // adapted from moses-cmd/Main.cpp
- std::ios::fmtflags old_flags = out.setf(std::ios::fixed);
- size_t old_precision = out.precision(6);
- const vector<const StatelessFeatureFunction*>&
- slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>&
- sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
-
- for (size_t i = 0; i < sff.size(); ++i) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(out,ff);
- }
- else {
- out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(out,ff);
- }
- else {
- out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
- if (! (old_flags & std::ios::fixed))
- out.unsetf(std::ios::fixed);
- out.precision(old_precision);
+ // adapted from moses-cmd/Main.cpp
+ std::ios::fmtflags old_flags = out.setf(std::ios::fixed);
+ size_t old_precision = out.precision(6);
+ const vector<const StatelessFeatureFunction*>&
+ slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ const vector<const StatefulFeatureFunction*>&
+ sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+
+ for (size_t i = 0; i < sff.size(); ++i) {
+ const StatefulFeatureFunction *ff = sff[i];
+ if (ff->IsTuneable()) {
+ PrintFeatureWeight(out,ff);
+ }
+ else {
+ out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
+ }
+ }
+ for (size_t i = 0; i < slf.size(); ++i) {
+ const StatelessFeatureFunction *ff = slf[i];
+ if (ff->IsTuneable()) {
+ PrintFeatureWeight(out,ff);
+ }
+ else {
+ out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
+ }
+ }
+ if (! (old_flags & std::ios::fixed))
+ out.unsetf(std::ios::fixed);
+ out.precision(old_precision);
}
int main(int argc, char** argv)
{
-
- //Extract port and log, send other args to moses
- char** mosesargv = new char*[argc+2];
- int mosesargc = 0;
- int port = 8080;
- const char* logfile = "/dev/null";
- bool isSerial = false;
-
- for (int i = 0; i < argc; ++i) {
- if (!strcmp(argv[i],"--server-port")) {
- ++i;
- if (i >= argc) {
- cerr << "Error: Missing argument to --server-port" << endl;
- exit(1);
- } else {
- port = atoi(argv[i]);
- }
- } else if (!strcmp(argv[i],"--server-log")) {
- ++i;
- if (i >= argc) {
- cerr << "Error: Missing argument to --server-log" << endl;
- exit(1);
- } else {
- logfile = argv[i];
- }
- } else if (!strcmp(argv[i], "--serial")) {
- cerr << "Running single-threaded server" << endl;
- isSerial = true;
- } else {
- mosesargv[mosesargc] = new char[strlen(argv[i])+1];
- strcpy(mosesargv[mosesargc],argv[i]);
- ++mosesargc;
- }
- }
-
- Parameter* params = new Parameter();
- if (!params->LoadParam(mosesargc,mosesargv)) {
- params->Explain();
- exit(1);
- }
- if (!StaticData::LoadDataStatic(params, argv[0])) {
- exit(1);
- }
-
- if (params->isParamSpecified("show-weights")) {
- ShowWeights(cout);
- exit(0);
- }
-
- //512 MB data limit (512KB is not enough for optimization)
- xmlrpc_limit_set(XMLRPC_XML_SIZE_LIMIT_ID, 512*1024*1024);
-
- xmlrpc_c::registry myRegistry;
-
- xmlrpc_c::methodPtr const translator(new Translator);
- xmlrpc_c::methodPtr const updater(new Updater);
- xmlrpc_c::methodPtr const optimizer(new Optimizer);
-
- myRegistry.addMethod("translate", translator);
- myRegistry.addMethod("updater", updater);
- myRegistry.addMethod("optimize", optimizer);
-
- xmlrpc_c::serverAbyss myAbyssServer(
- myRegistry,
- port, // TCP port on which to listen
- logfile
- );
- /* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
- xmlrpc_c::serverAbyss myAbyssServer(
- xmlrpc_c::serverAbyss::constrOpt()
- .registryPtr(&myRegistry)
- .portNumber(port) // TCP port on which to listen
- .logFileName(logfile)
- .allowOrigin("*")
- );
- */
-
- cerr << "Listening on port " << port << endl;
- if (isSerial) {
- while(1) {
- myAbyssServer.runOnce();
- }
- } else {
- myAbyssServer.run();
- }
- std::cerr << "xmlrpc_c::serverAbyss.run() returned but should not." << std::endl;
- return 1;
+
+ //Extract port and log, send other args to moses
+ char** mosesargv = new char*[argc+2]; // why "+2" [UG]
+ int mosesargc = 0;
+ int port = 8080;
+ const char* logfile = "/dev/null";
+ bool isSerial = false;
+ size_t numThreads = 10; //for translation tasks
+
+ for (int i = 0; i < argc; ++i) {
+ if (!strcmp(argv[i],"--server-port")) {
+ ++i;
+ if (i >= argc) {
+ cerr << "Error: Missing argument to --server-port" << endl;
+ exit(1);
+ } else {
+ port = atoi(argv[i]);
+ }
+ } else if (!strcmp(argv[i],"--server-log")) {
+ ++i;
+ if (i >= argc) {
+ cerr << "Error: Missing argument to --server-log" << endl;
+ exit(1);
+ } else {
+ logfile = argv[i];
+ }
+ } else if (!strcmp(argv[i], "--threads")) {
+ ++i;
+ if (i>=argc) {
+ cerr << "Error: Missing argument to --threads" << endl;
+ exit(1);
+ } else {
+ numThreads = atoi(argv[i]);
+ }
+ } else if (!strcmp(argv[i], "--serial")) {
+ cerr << "Running single-threaded server" << endl;
+ isSerial = true;
+ } else {
+ mosesargv[mosesargc] = new char[strlen(argv[i])+1];
+ strcpy(mosesargv[mosesargc],argv[i]);
+ ++mosesargc;
+ }
+ }
+
+ Parameter* params = new Parameter();
+ if (!params->LoadParam(mosesargc,mosesargv)) {
+ params->Explain();
+ exit(1);
+ }
+ if (!StaticData::LoadDataStatic(params, argv[0])) {
+ exit(1);
+ }
+
+ if (params->isParamSpecified("show-weights")) {
+ ShowWeights(cout);
+ exit(0);
+ }
+
+ //512 MB data limit (512KB is not enough for optimization)
+ xmlrpc_limit_set(XMLRPC_XML_SIZE_LIMIT_ID, 512*1024*1024);
+
+ xmlrpc_c::registry myRegistry;
+
+ xmlrpc_c::methodPtr const translator(new Translator(numThreads));
+ xmlrpc_c::methodPtr const updater(new Updater);
+ xmlrpc_c::methodPtr const optimizer(new Optimizer);
+
+ myRegistry.addMethod("translate", translator);
+ myRegistry.addMethod("updater", updater);
+ myRegistry.addMethod("optimize", optimizer);
+
+ xmlrpc_c::serverAbyss myAbyssServer(
+ myRegistry,
+ port, // TCP port on which to listen
+ logfile
+ );
+ /* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
+ xmlrpc_c::serverAbyss myAbyssServer(
+ xmlrpc_c::serverAbyss::constrOpt()
+ .registryPtr(&myRegistry)
+ .portNumber(port) // TCP port on which to listen
+ .logFileName(logfile)
+ .allowOrigin("*")
+ );
+ */
+
+ XVERBOSE(1,"Listening on port " << port << endl);
+ if (isSerial) {
+ while(1) myAbyssServer.runOnce();
+ } else {
+ myAbyssServer.run();
+ }
+ std::cerr << "xmlrpc_c::serverAbyss.run() returned but should not." << std::endl;
+ return 1;
}
diff --git a/contrib/sigtest-filter/Makefile b/contrib/sigtest-filter/Makefile
index 71de9c45f..55772929a 100644
--- a/contrib/sigtest-filter/Makefile
+++ b/contrib/sigtest-filter/Makefile
@@ -7,4 +7,4 @@ all: filter-pt
filter-pt: filter-pt.cpp
./check-install $(SALMDIR)
- $(CXX) -O6 $(INC) $(OBJS) -o filter-pt filter-pt.cpp
+ $(CXX) -O6 $(INC) $(OBJS) -o filter-pt filter-pt.cpp -lboost_thread -lboost_system -lpthread -lrt
diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp
index 6ab1a5657..bd0b9ae36 100644
--- a/contrib/sigtest-filter/filter-pt.cpp
+++ b/contrib/sigtest-filter/filter-pt.cpp
@@ -4,6 +4,8 @@
#include <cstdio>
#include <cstdlib>
#include <algorithm>
+#include <fstream>
+#include <sstream>
#include "_SuffixArraySearchApplicationBase.h"
@@ -11,18 +13,16 @@
#include <iostream>
#include <set>
+#include <boost/thread/tss.hpp>
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
+
#ifdef WIN32
#include "WIN32_functions.h"
#else
#include <unistd.h>
#endif
-typedef std::vector<TextLenType> SentIdSet;
-typedef std::pair<SentIdSet, clock_t> ClockedSentIdSet;
-typedef std::map<std::string, ClockedSentIdSet> PhraseSetMap;
-
-#undef min
-
// constants
const size_t MINIMUM_SIZE_TO_KEEP = 10000; // increase this to improve memory usage,
// reduce for speed
@@ -39,12 +39,9 @@ double sig_filter_limit = 0; // keep phrase pairs with -log(sig) > si
// higher = filter-more
bool pef_filter_only = false; // only filter based on pef
bool hierarchical = false;
-int max_cache = 0;
-// globals
-PhraseSetMap esets;
-PhraseSetMap fsets;
double p_111 = 0.0; // alpha
+size_t pt_lines = 0;
size_t nremoved_sigfilter = 0;
size_t nremoved_pfefilter = 0;
@@ -52,6 +49,69 @@ C_SuffixArraySearchApplicationBase e_sa;
C_SuffixArraySearchApplicationBase f_sa;
int num_lines;
+boost::mutex in_mutex;
+boost::mutex out_mutex;
+boost::mutex err_mutex;
+
+typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
+
+class Cache {
+ typedef std::pair<SentIdSet, clock_t> ClockedSet;
+ typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;
+
+ public:
+
+ SentIdSet get(const std::string& phrase) {
+ boost::shared_lock<boost::shared_mutex> lock(m_mutex);
+ if(m_cont.count(phrase)) {
+ ClockedSet& set = m_cont[phrase];
+ set.second = clock();
+ return set.first;
+ }
+ return SentIdSet( new SentIdSet::element_type() );
+ }
+
+ void put(const std::string& phrase, const SentIdSet set) {
+ boost::unique_lock<boost::shared_mutex> lock(m_mutex);
+ m_cont[phrase] = std::make_pair(set, clock());
+ }
+
+ static void set_max_cache(size_t max_cache) {
+ s_max_cache = max_cache;
+ }
+
+ void prune() {
+ if(s_max_cache > 0) {
+ boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
+ if(m_cont.size() > s_max_cache) {
+ std::vector<clock_t> clocks;
+ for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
+ clocks.push_back(it->second.second);
+
+ std::sort(clocks.begin(), clocks.end());
+ clock_t out = clocks[m_cont.size() - s_max_cache];
+
+ boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
+ for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
+ if(it->second.second < out)
+ m_cont.erase(it);
+ }
+ }
+ }
+
+ private:
+ ClockedMap m_cont;
+ boost::shared_mutex m_mutex;
+ static size_t s_max_cache;
+};
+
+size_t Cache::s_max_cache = 0;
+
+Cache f_cache;
+Cache e_cache;
+
+#undef min
+
void usage()
{
std::cerr << "\nFilter phrase table using significance testing as described\n"
@@ -59,12 +119,13 @@ void usage()
<< "by Discarding Most of the Phrasetable. EMNLP 2007.\n"
<< "\nUsage:\n"
<< "\n filter-pt -e english.suf-arr -f french.suf-arr\n"
- << " [-c] [-p] [-l threshold] [-n num] < PHRASE-TABLE > FILTERED-PHRASE-TABLE\n\n"
+ << " [-c] [-p] [-l threshold] [-n num] [-t num] < PHRASE-TABLE > FILTERED-PHRASE-TABLE\n\n"
<< " [-l threshold] >0.0, a+e, or a-e: keep values that have a -log significance > this\n"
<< " [-n num ] 0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements\n"
<< " [-c ] add the cooccurence counts to the phrase table\n"
<< " [-p ] add -log(significance) to the phrasetable\n"
<< " [-h ] filter hierarchical rule table\n"
+ << " [-t num ] use num threads\n"
<< " [-m num ] limit cache to num most recent phrases\n";
exit(1);
}
@@ -133,9 +194,6 @@ PTEntry::PTEntry(const std::string& str, int index) :
*fp++=0;
this->pfe = atof(f);
-
- // std::cerr << "L: " << f_phrase << " ::: " << e_phrase << " ::: " << scores << " ::: " << pfe << std::endl;
- // std::cerr << "X: " << extra << "\n";
}
struct PfeComparer {
@@ -168,7 +226,8 @@ std::ostream& operator << (std::ostream& os, const PTEntry& pp)
void print(int a, int b, int c, int d, float p)
{
std::cerr << a << "\t" << b << "\t P=" << p << "\n"
- << c << "\t" << d << "\t xf=" << (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1) << "\n\n";
+ << c << "\t" << d << "\t xf="
+ << (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1) << "\n\n";
}
// 2x2 (one-sided) Fisher's exact test
@@ -184,13 +243,13 @@ double fisher_exact(int cfe, int ce, int cf)
int d = (num_lines - ce - cf + cfe);
int n = a + b + c + d;
- double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d) - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c) - lgamma(1+d));
+ double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d)
+ - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c)
+ - lgamma(1+d));
double total_p = 0.0;
int tc = std::min(b,c);
for (int i=0; i<=tc; i++) {
total_p += cp;
-// double lg = lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d) - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c) - lgamma(1+d); double cp = exp(lg);
-// print(a,b,c,d,cp);
double coef = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
cp *= coef;
++a;
@@ -202,86 +261,73 @@ double fisher_exact(int cfe, int ce, int cf)
}
template <class setType>
-setType ordered_set_intersect(setType & set_1, setType & set_2)
+void ordered_set_intersect(setType& out, const setType set_1, const setType set_2)
{
- setType set_out;
- std::set_intersection(set_1.begin(), set_1.end(), set_2.begin(), set_2.end(), inserter(set_out,set_out.begin()) );
- return set_out;
+ std::set_intersection(set_1->begin(), set_1->end(), set_2->begin(),
+ set_2->end(), inserter(*out, out->begin()) );
}
-SentIdSet lookup_phrase(const std::string & phrase, C_SuffixArraySearchApplicationBase & my_sa)
+void lookup_phrase(SentIdSet& ids, const std::string& phrase,
+ C_SuffixArraySearchApplicationBase & my_sa, Cache& cache)
{
- SentIdSet occur_set;
- vector<S_SimplePhraseLocationElement> locations;
-
- locations = my_sa.locateExactPhraseInCorpus(phrase.c_str());
- if(locations.size()==0) {
- cerr<<"No occurrences found!!\n";
- }
- for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin(); i != locations.end(); ++i) {
- occur_set.push_back(i->sentIdInCorpus);
+ ids = cache.get(phrase);
+ if(ids->empty()) {
+ vector<S_SimplePhraseLocationElement> locations;
+ locations = my_sa.locateExactPhraseInCorpus(phrase.c_str());
+ if(locations.size()==0) {
+ cerr<<"No occurrences found!!\n";
+ }
+ for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin();
+ i != locations.end(); ++i) {
+ ids->push_back(i->sentIdInCorpus);
+ }
+
+ std::sort(ids->begin(), ids->end());
+ SentIdSet::element_type::iterator it =
+ std::unique(ids->begin(), ids->end());
+ ids->resize(it - ids->begin());
+
+ if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
+ cache.put(phrase, ids);
}
-
- std::sort(occur_set.begin(), occur_set.end());
- SentIdSet::iterator it = std::unique(occur_set.begin(), occur_set.end());
- occur_set.resize(it - occur_set.begin());
-
- return occur_set;
}
-
-// slight simplicifaction: we consider all sentences in which "a" and "b" occur to be instances of the rule "a [X][X] b".
-SentIdSet lookup_multiple_phrases(vector<std::string> & phrases, C_SuffixArraySearchApplicationBase & my_sa, const std::string & rule, PhraseSetMap & cache)
-{
+void lookup_multiple_phrases(SentIdSet& ids, vector<std::string> & phrases,
+ C_SuffixArraySearchApplicationBase & my_sa,
+ const std::string & rule, Cache& cache)
+{
if (phrases.size() == 1) {
- return lookup_phrase(phrases.front(), my_sa);
+ lookup_phrase(ids, phrases.front(), my_sa, cache);
}
-
else {
- SentIdSet main_set;
- ClockedSentIdSet & clocked_first_set = cache[phrases.front()];
- SentIdSet & first_set = clocked_first_set.first;
- clocked_first_set.second = clock();
-
+ SentIdSet main_set( new SentIdSet::element_type() );
bool first = true;
- if (first_set.empty()) {
- first_set = lookup_phrase(phrases.front(), my_sa);
- }
- for (vector<std::string>::iterator phrase=phrases.begin()+1; phrase != phrases.end(); ++phrase) {
- ClockedSentIdSet & clocked_temp_set = cache[*phrase];
- SentIdSet & temp_set = clocked_temp_set.first;
- clocked_temp_set.second = clock();
-
- if (temp_set.empty()) {
- temp_set = lookup_phrase(*phrase, my_sa);
- }
+ SentIdSet first_set( new SentIdSet::element_type() );
+ lookup_phrase(first_set, phrases.front(), my_sa, cache);
+ for (vector<std::string>::iterator phrase=phrases.begin()+1;
+ phrase != phrases.end(); ++phrase) {
+ SentIdSet temp_set( new SentIdSet::element_type() );
+ lookup_phrase(temp_set, *phrase, my_sa, cache);
if (first) {
- main_set = ordered_set_intersect(first_set,temp_set);
+ ordered_set_intersect(main_set, first_set, temp_set);
first = false;
}
else {
- main_set = ordered_set_intersect(main_set,temp_set);
- }
- if (temp_set.size() < MINIMUM_SIZE_TO_KEEP) {
- cache.erase(*phrase);
+ SentIdSet new_set( new SentIdSet::element_type() );
+ ordered_set_intersect(new_set, main_set, temp_set);
+ main_set->swap(*new_set);
}
}
-
- if (first_set.size() < MINIMUM_SIZE_TO_KEEP) {
- cache.erase(phrases.front());
- }
-
- return main_set;
+ ids->swap(*main_set);
}
}
-SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicationBase & my_sa, PhraseSetMap & cache)
+void find_occurrences(SentIdSet& ids, const std::string& rule,
+ C_SuffixArraySearchApplicationBase& my_sa, Cache& cache)
{
- SentIdSet sa_set;
-
// we search for hierarchical rules by stripping away NT and looking for terminals sequences
// if a rule contains multiple sequences of terminals, we intersect their occurrences.
if (hierarchical) {
@@ -305,76 +351,142 @@ SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicati
phrases.push_back(rule.substr(pos,NTStartPos-pos));
}
- sa_set = lookup_multiple_phrases(phrases, my_sa, rule, cache);
+ lookup_multiple_phrases(ids, phrases, my_sa, rule, cache);
}
else {
- sa_set = lookup_phrase(rule, my_sa);
+ lookup_phrase(ids, rule, my_sa, cache);
}
- return sa_set;
}
// input: unordered list of translation options for a single source phrase
-void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
+void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
+ Cache& f_cache, Cache& e_cache)
{
- if (pfe_filter_limit>0 && options.size() > pfe_filter_limit) {
+ if (pfe_filter_limit > 0 && options.size() > pfe_filter_limit) {
nremoved_pfefilter += (options.size() - pfe_filter_limit);
- std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer());
- for (std::vector<PTEntry*>::iterator i=options.begin()+pfe_filter_limit; i != options.end(); ++i)
+ std::nth_element(options.begin(), options.begin() + pfe_filter_limit,
+ options.end(), PfeComparer());
+ for (std::vector<PTEntry*>::iterator i = options.begin() + pfe_filter_limit;
+ i != options.end(); ++i)
delete *i;
- options.erase(options.begin()+pfe_filter_limit,options.end());
+ options.erase(options.begin() + pfe_filter_limit,options.end());
}
- if (pef_filter_only) return;
-// std::cerr << "f phrase: " << options.front()->f_phrase << "\n";
- SentIdSet fset;
- fset = find_occurrences(options.front()->f_phrase, f_sa, fsets);
- size_t cf = fset.size();
- for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
+
+ if (pef_filter_only)
+ return;
+
+ if (options.empty())
+ return;
+
+ SentIdSet fset( new SentIdSet::element_type() );
+ find_occurrences(fset, options.front()->f_phrase, f_sa, f_cache);
+ size_t cf = fset->size();
+
+ for (std::vector<PTEntry*>::iterator i = options.begin();
+ i != options.end(); ++i) {
const std::string& e_phrase = (*i)->e_phrase;
- size_t cef=0;
- ClockedSentIdSet& clocked_eset = esets[e_phrase];
- SentIdSet & eset = clocked_eset.first;
- clocked_eset.second = clock();
- if (eset.empty()) {
- eset = find_occurrences(e_phrase, e_sa, esets);
- //std::cerr << "Looking up e-phrase: " << e_phrase << "\n";
- }
- size_t ce=eset.size();
- if (ce < cf) {
- for (SentIdSet::iterator i=eset.begin(); i != eset.end(); ++i) {
- if (std::binary_search(fset.begin(), fset.end(), *i)) cef++;
- }
- } else {
- for (SentIdSet::iterator i=fset.begin(); i != fset.end(); ++i) {
- if (std::binary_search(eset.begin(), eset.end(), *i)) cef++;
- }
- }
+ SentIdSet eset( new SentIdSet::element_type() );
+ find_occurrences(eset, e_phrase, e_sa, e_cache);
+ size_t ce = eset->size();
+
+ SentIdSet efset( new SentIdSet::element_type() );
+ ordered_set_intersect(efset, fset, eset);
+ size_t cef = efset->size();
+
double nlp = -log(fisher_exact(cef, cf, ce));
(*i)->set_cooc_stats(cef, cf, ce, nlp);
- if (ce < MINIMUM_SIZE_TO_KEEP) {
- esets.erase(e_phrase);
- }
-
}
+
std::vector<PTEntry*>::iterator new_end =
- std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
+ std::remove_if(options.begin(), options.end(),
+ NlogSigThresholder(sig_filter_limit));
nremoved_sigfilter += (options.end() - new_end);
options.erase(new_end,options.end());
}
-void prune_cache(PhraseSetMap & psm) {
- if(max_cache && psm.size() > max_cache) {
- std::vector<clock_t> clocks;
- for(PhraseSetMap::iterator it = psm.begin(); it != psm.end(); it++)
- clocks.push_back(it->second.second);
-
- std::sort(clocks.begin(), clocks.end());
- clock_t out = clocks[psm.size()-max_cache];
+void filter(std::istream* in, std::ostream* out, int pfe_index) {
+
+ std::vector<std::string> lines;
+ std::string prev = "";
+ std::vector<PTEntry*> options;
+ while(true) {
+ {
+ boost::mutex::scoped_lock lock(in_mutex);
+ if(in->eof())
+ break;
+
+ lines.clear();
+ std::string line;
+ while(getline(*in, line) && lines.size() < 500000)
+ lines.push_back(line);
+ }
- for(PhraseSetMap::iterator it = psm.begin(); it != psm.end(); it++)
- if(it->second.second < out)
- psm.erase(it);
+ std::stringstream out_temp;
+ for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
+ size_t tmp_lines = ++pt_lines;
+ if(tmp_lines % 10000 == 0) {
+ boost::mutex::scoped_lock lock(err_mutex);
+ std::cerr << ".";
+
+ if(tmp_lines % 500000 == 0)
+ std::cerr << "[n:" << tmp_lines << "]\n";
+
+ if(tmp_lines % 10000000 == 0) {
+ float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
+ float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
+ std::cerr << "------------------------------------------------------\n"
+ << " unfiltered phrases pairs: " << pt_lines << "\n"
+ << "\n"
+ << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
+ << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n"
+ << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
+ << "\n"
+ << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
+ << "------------------------------------------------------\n";
+ }
+ }
+
+ if(pt_lines % 10000 == 0) {
+ f_cache.prune();
+ e_cache.prune();
+ }
+
+ if(it->length() > 0) {
+ PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
+ if (prev != pp->f_phrase) {
+ prev = pp->f_phrase;
+
+ if (!options.empty()) { // always true after first line
+ compute_cooc_stats_and_filter(options, f_cache, e_cache);
+ }
+
+ for (std::vector<PTEntry*>::iterator i = options.begin();
+ i != options.end(); ++i) {
+ out_temp << **i << '\n';
+ delete *i;
+ }
+
+ options.clear();
+ options.push_back(pp);
+
+ } else {
+ options.push_back(pp);
+ }
+ }
+ }
+ boost::mutex::scoped_lock lock(out_mutex);
+ *out << out_temp.str() << std::flush;
+ }
+ compute_cooc_stats_and_filter(options, f_cache, e_cache);
+
+ boost::mutex::scoped_lock lock(out_mutex);
+ for (std::vector<PTEntry*>::iterator i = options.begin();
+ i != options.end(); ++i) {
+ *out << **i << '\n';
+ delete *i;
}
+ *out << std::flush;
}
int main(int argc, char * argv[])
@@ -383,7 +495,9 @@ int main(int argc, char * argv[])
const char* efile=0;
const char* ffile=0;
int pfe_index = 2;
- while ((c = getopt(argc, argv, "cpf:e:i:n:l:m:h")) != -1) {
+ int threads = 1;
+ size_t max_cache = 0;
+ while ((c = getopt(argc, argv, "cpf:e:i:n:t:l:m:h")) != -1) {
switch (c) {
case 'e':
efile = optarg;
@@ -398,6 +512,14 @@ int main(int argc, char * argv[])
pfe_filter_limit = atoi(optarg);
std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
break;
+ case 't':
+ threads = atoi(optarg);
+ std::cerr << "Using threads: " << threads << std::endl;
+ break;
+ case 'm':
+ max_cache = atoi(optarg);
+ std::cerr << "Using max phrases in caches: " << max_cache << std::endl;
+ break;
case 'c':
print_cooc_counts = true;
break;
@@ -407,9 +529,6 @@ int main(int argc, char * argv[])
case 'h':
hierarchical = true;
break;
- case 'm':
- max_cache = atoi(optarg);
- break;
case 'l':
std::cerr << "-l = " << optarg << "\n";
if (strcmp(optarg,"a+e") == 0) {
@@ -429,12 +548,13 @@ int main(int argc, char * argv[])
usage();
}
}
+
if (sig_filter_limit == 0.0) pef_filter_only = true;
//-----------------------------------------------------------------------------
if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
usage();
}
-
+
//load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
if (!pef_filter_only) {
e_sa.loadData_forSearch(efile, false, false);
@@ -460,52 +580,17 @@ int main(int argc, char * argv[])
std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
}
- char tmpString[10000];
- std::string prev = "";
- std::vector<PTEntry*> options;
- size_t pt_lines = 0;
- while(!cin.eof()) {
- cin.getline(tmpString,10000,'\n');
- if(++pt_lines%10000==0) {
- std::cerr << ".";
-
- prune_cache(esets);
- prune_cache(fsets);
-
- if(pt_lines%500000==0)
- std::cerr << "[n:"<<pt_lines<<"]\n";
- }
+ Cache::set_max_cache(max_cache);
+ std::ios_base::sync_with_stdio(false);
+
+ boost::thread_group threadGroup;
+ for(int i = 0; i < threads; i++)
+ threadGroup.add_thread(new boost::thread(filter, &std::cin, &std::cout, pfe_index));
+ threadGroup.join_all();
- if(strlen(tmpString)>0) {
- PTEntry* pp = new PTEntry(tmpString, pfe_index);
- if (prev != pp->f_phrase) {
- prev = pp->f_phrase;
-
- if (!options.empty()) { // always true after first line
- compute_cooc_stats_and_filter(options);
- }
- for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
- std::cout << **i << std::endl;
- delete *i;
- }
- options.clear();
- options.push_back(pp);
-
- } else {
- options.push_back(pp);
- }
- // for(int i=0;i<locations.size(); i++){
- // cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
- // }
- }
- }
- compute_cooc_stats_and_filter(options);
- for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
- std::cout << **i << std::endl;
- delete *i;
- }
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
+
std::cerr << "\n\n------------------------------------------------------\n"
<< " unfiltered phrases pairs: " << pt_lines << "\n"
<< "\n"
@@ -514,7 +599,5 @@ int main(int argc, char * argv[])
<< " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
<< "\n"
<< " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
- << "------------------------------------------------------\n";
-
- return 0;
+ << "------------------------------------------------------\n";
}
diff --git a/contrib/tmcombine/test/model5/model/lex.counts.e2f b/contrib/tmcombine/test/model5/model/lex.counts.e2f
new file mode 100644
index 000000000..ed05c0b7d
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/lex.counts.e2f
@@ -0,0 +1,8 @@
+ad af 500 1000
+bd bf 5 10
+der le 20285 102586
+der NULL 12926 704917
+gipfel sommet 3485 7322
+pass col 419 2911
+pass passeport 7 28
+sitzung séance 14 59 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model5/model/lex.counts.f2e b/contrib/tmcombine/test/model5/model/lex.counts.f2e
new file mode 100644
index 000000000..ea31f690d
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/lex.counts.f2e
@@ -0,0 +1,8 @@
+af ad 500 1000
+bf bd 5 10
+col pass 419 615
+le der 20285 113635
+passeport pass 7 615
+retrouvé NULL 34 1016136
+séance sitzung 14 33
+sommet gipfel 3485 5700 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model5/model/lex.e2f b/contrib/tmcombine/test/model5/model/lex.e2f
new file mode 100644
index 000000000..f9263ffe5
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/lex.e2f
@@ -0,0 +1,8 @@
+ad af 0.5
+bd bf 0.5
+der le 0.1977365
+der NULL 0.0183369
+gipfel sommet 0.4759629
+pass col 0.1439368
+pass passeport 0.2500000
+sitzung séance 0.2372881 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model5/model/lex.f2e b/contrib/tmcombine/test/model5/model/lex.f2e
new file mode 100644
index 000000000..2bba51f01
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/lex.f2e
@@ -0,0 +1,8 @@
+af ad 0.5
+bf bd 0.5
+col pass 0.6813008
+le der 0.1785101
+passeport pass 0.0113821
+retrouvé NULL 0.0000335
+séance sitzung 0.4242424
+sommet gipfel 0.6114035 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model5/model/phrase-table b/contrib/tmcombine/test/model5/model/phrase-table
new file mode 100644
index 000000000..5621b5acf
--- /dev/null
+++ b/contrib/tmcombine/test/model5/model/phrase-table
@@ -0,0 +1,8 @@
+ad [X][X] [X] ||| af [X][X] [X] ||| 0.5 0.5 0.5 0.5 2.718 ||| 0-0 1-1 ||| 1000 1000
+bd [X] ||| bf [X] ||| 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 10 10
+der gipfel [X] ||| sommet [X] ||| 0.00327135 0.00872768 0.0366795 0.611403 2.718 ||| 1-0 ||| 5808 518
+der [X][X] pass [X] ||| le [X][X] col [X] ||| 0.0173565 0.0284616 0.288889 0.121619 2.718 ||| 0-0 1-1 2-2 ||| 749 45
+pass [X] ||| col [X] ||| 0.1952 0.143937 0.628866 0.681301 2.718 ||| 0-0 ||| 1875 582
+pass [X] ||| passeport retrouvé [X] ||| 0.5 0.25 0.00171821 3.813e-07 2.718 ||| 0-0 ||| 2 582
+pass [X] ||| passeport [X] ||| 0.266667 0.25 0.00687285 0.0113821 2.718 ||| 0-0 ||| 15 582
+[X][X] sitzung [X] ||| [X][X] séance [X] ||| 0.272727 0.237288 0.352941 0.424242 2.718 ||| 0-0 1-1 ||| 22 17 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/lex.counts.e2f b/contrib/tmcombine/test/model6/model/lex.counts.e2f
new file mode 100644
index 000000000..8475fcdf9
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/lex.counts.e2f
@@ -0,0 +1,8 @@
+ad af 100 1000
+bd bf 1 10
+der le 150181 944391
+der NULL 54483 3595140
+gipfel sommet 3421 9342
+pass col 2 70
+pass passeport 73 379
+sitzung séance 3441 5753 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/lex.counts.f2e b/contrib/tmcombine/test/model6/model/lex.counts.f2e
new file mode 100644
index 000000000..b0913088a
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/lex.counts.f2e
@@ -0,0 +1,8 @@
+af ad 100 1000
+bf bd 1 10
+col pass 2 108
+le der 150181 1356104
+passeport pass 73 108
+retrouvé NULL 43 6276240
+séance sitzung 3441 6142
+sommet gipfel 3421 4908 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/lex.e2f b/contrib/tmcombine/test/model6/model/lex.e2f
new file mode 100644
index 000000000..b1ce3a613
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/lex.e2f
@@ -0,0 +1,8 @@
+ad af 0.1
+bd bf 0.1
+der le 0.1590242
+der NULL 0.0151546
+gipfel sommet 0.366195
+pass col 0.0285714
+pass passeport 0.1926121
+sitzung séance 0.5981227 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/lex.f2e b/contrib/tmcombine/test/model6/model/lex.f2e
new file mode 100644
index 000000000..d931dcb72
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/lex.f2e
@@ -0,0 +1,8 @@
+af ad 0.1
+bf bd 0.1
+col pass 0.0185185
+le der 0.1107445
+passeport pass 0.6759259
+retrouvé NULL 0.0000069
+séance sitzung 0.5602410
+sommet gipfel 0.6970253 \ No newline at end of file
diff --git a/contrib/tmcombine/test/model6/model/phrase-table b/contrib/tmcombine/test/model6/model/phrase-table
new file mode 100644
index 000000000..9c260f171
--- /dev/null
+++ b/contrib/tmcombine/test/model6/model/phrase-table
@@ -0,0 +1,5 @@
+ad [X][X] [X] ||| af [X][X] [X] ||| 0.1 0.1 0.1 0.1 2.718 ||| 0-0 1-1 ||| 1000 1000
+bd [X] ||| bf [X] ||| 0.1 0.1 0.1 0.1 2.718 ||| 0-0 ||| 10 10
+der [X][X] pass [X] ||| le [X][X] passeport [X] ||| 0.16 0.03063 0.4 0.0748551 2.718 ||| 0-0 1-1 2-2 ||| 25 10
+pass [X] ||| passeport [X] ||| 0.28022 0.192612 0.607143 0.675926 2.718 ||| 0-0 ||| 182 84
+[X][X] sitzung [X] ||| [X][X] séance [X] ||| 0.784521 0.598123 0.516654 0.560241 2.718 ||| 0-0 1-1 ||| 4251 6455 \ No newline at end of file
diff --git a/contrib/tmcombine/test/phrase-table_test11 b/contrib/tmcombine/test/phrase-table_test11
new file mode 100644
index 000000000..128cf07d9
--- /dev/null
+++ b/contrib/tmcombine/test/phrase-table_test11
@@ -0,0 +1,9 @@
+ad [X][X] [X] ||| af [X][X] [X] ||| 0.14 0.136364 0.18 0.3 ||| 0-0 1-1 ||| 10000.0 5000.0
+bd [X] ||| bf [X] ||| 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 100.0 50.0
+der [X][X] pass [X] ||| le [X][X] passeport [X] ||| 0.16 0.0307772 0.4 0.0128336 ||| 0-0 1-1 2-2 ||| 225.0 40.0
+der gipfel [X] ||| sommet [X] ||| 0.00327135 0.00569336 0.0366795 0.651018 ||| 1-0 ||| 5808.0 518.0
+der [X][X] pass [X] ||| le [X][X] col [X] ||| 0.0173565 0.0193836 0.288889 0.0675369 ||| 0-0 1-1 2-2 ||| 749.0 45.0
+pass [X] ||| col [X] ||| 0.1952 0.121573 0.398693 0.582296 ||| 0-0 ||| 1875.0 918.0
+pass [X] ||| passeport [X] ||| 0.280097 0.193033 0.22658 0.11065 ||| 0-0 ||| 1653.0 918.0
+pass [X] ||| passeport retrouvé [X] ||| 0.5 0.193033 0.00108932 1.16835e-06 ||| 0-0 ||| 2.0 918.0
+[X][X] sitzung [X] ||| [X][X] séance [X] ||| 0.784227 0.597753 0.516546 0.559514 ||| 0-0 1-1 ||| 38281.0 25837.0
diff --git a/contrib/tmcombine/tmcombine.py b/contrib/tmcombine/tmcombine.py
index b512188d7..467a24e19 100755
--- a/contrib/tmcombine/tmcombine.py
+++ b/contrib/tmcombine/tmcombine.py
@@ -1176,6 +1176,9 @@ def compute_lexicalweight(weights,alignment,word_pairs,marginal,mode='counts',ca
mycache[1] = defaultdict(dict)
for x,translations in alignment:
+ # skip nonterminals
+ if x.startswith(b'['):
+ continue
if cache and translations in mycache[1][x]:
lex_step = mycache[1][x][translations]
@@ -1870,7 +1873,12 @@ def test():
sys.stderr.write('Regression test 10\n')
Combiner = Combine_TMs([[os.path.join('test','model3'),'primary'],[os.path.join('test','model4'),'primary']],output_file=os.path.join('test','phrase-table_test10'),mode='counts',number_of_features=8,i_e2f=4,i_e2f_lex=5,i_f2e=6,i_f2e_lex=7,reference_file='test/extract')
Combiner.combine_given_tuning_set()
-
+
+ # count-based combination of two hierarchical models, with fixed weights. Same as test 3, but with hierarchical models
+ # command line: python tmcombine.py combine_given_weights test/model5 test/model6 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test11 -m counts
+ sys.stderr.write('Regression test 11\n')
+ Combiner = Combine_TMs([[os.path.join('test','model5'),'primary'],[os.path.join('test','model6'),'primary']],[[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test11'),mode='counts')
+ Combiner.combine_given_weights()
#convert weight vector passed as a command line argument
class to_list(argparse.Action):
diff --git a/doc/PhraseDictionaryBitextSampling.howto b/doc/PhraseDictionaryBitextSampling.howto
new file mode 100644
index 000000000..69ab11b5b
--- /dev/null
+++ b/doc/PhraseDictionaryBitextSampling.howto
@@ -0,0 +1,4 @@
+The documentation for memory-mapped, dynamic suffix arrays has moved to
+ http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40
+
+Search for PhraseDictionaryBitextSampling.
diff --git a/jam-files/sanity.jam b/jam-files/sanity.jam
index 79fd0ce1c..9b4d5873d 100644
--- a/jam-files/sanity.jam
+++ b/jam-files/sanity.jam
@@ -58,7 +58,7 @@ if $(FORCE-STATIC) {
rule test_library ( name ) {
if $(FORCE-STATIC) {
- return [ test_flags "-l$(name) -static" ] ;
+ return [ test_flags "-Wl,-Bstatic -l$(name) -Wl,-Bdynamic" ] ;
} else {
return [ test_flags "-l$(name)" ] ;
}
@@ -88,7 +88,7 @@ rule auto-shared ( name : additional * ) {
if $(shared-command-line) = "<link>shared" {
return "<link>shared" ;
} else {
- if [ test_flags $(additional)" -static -l"$(name) ] {
+ if [ test_flags $(additional)" -Wl,-Bstatic -l"$(name)" -Wl,-Bdynamic" ] {
return ;
} else {
if $(FORCE-STATIC) {
@@ -131,10 +131,7 @@ if $(with-macports) {
#Convenience rule for boost libraries. Defines library boost_$(name).
rule boost-lib ( name macro : deps * ) {
- #Link multi-threaded programs against the -mt version if available. Old
- #versions of boost do not have -mt tagged versions of all libraries. Sadly,
- #boost.jam does not handle this correctly.
- flags = $(L-boost-search)" -lboost_"$(name)"-mt$(boost-lib-version)" ;
+ flags = $(L-boost-search)" -lboost_"$(name)"$(boost-lib-version)" ;
local main ;
if $(name) = "unit_test_framework" {
main = "BOOST_AUTO_TEST_CASE(foo) {}" ;
@@ -143,11 +140,11 @@ rule boost-lib ( name macro : deps * ) {
if $(boost-auto-shared) = "<link>shared" {
flags += " -DBOOST_$(macro)" ;
} else {
- flags += " -static" ;
+ flags = " -Wl,-Bstatic $(flags) -Wl,-Bdynamic " ;
}
if [ test_flags $(flags) : $(main) ] {
lib inner_boost_$(name) : : <threading>single $(boost-search) <name>boost_$(name)$(boost-lib-version) : <link>static : <library>$(deps) ;
- lib inner_boost_$(name) : : <threading>multi $(boost-search) <name>boost_$(name)-mt$(boost-lib-version) : <link>static : <library>$(deps) ;
+ lib inner_boost_$(name) : : <threading>multi $(boost-search) <name>boost_$(name)$(boost-lib-version) : <link>static : <library>$(deps) ;
} else {
lib inner_boost_$(name) : : $(boost-search) <name>boost_$(name)$(boost-lib-version) : : <library>$(deps) ;
}
diff --git a/lm/Jamfile b/lm/Jamfile
index 4693f9e01..edc3751a7 100644
--- a/lm/Jamfile
+++ b/lm/Jamfile
@@ -14,7 +14,7 @@ update-if-changed $(ORDER-LOG) $(max-order) ;
max-order += <dependency>$(ORDER-LOG) ;
wrappers = ;
-local with-nplm = [ option.get "with-nplm" ] ;
+local with-nplm = [ option.get "with-nplm-0.1" ] ;
if $(with-nplm) {
lib neuralLM : : <search>$(with-nplm)/src ;
obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ;
@@ -37,4 +37,4 @@ for local p in [ glob *_main.cc ] {
exes += $(name) ;
}
-alias programs : $(exes) filter//filter : <threading>multi:<source>builder//lmplz ;
+alias programs : $(exes) filter//filter builder//dump_counts : <threading>multi:<source>builder//lmplz ;
diff --git a/lm/builder/Jamfile b/lm/builder/Jamfile
index b596e086a..1e0e18b5f 100644
--- a/lm/builder/Jamfile
+++ b/lm/builder/Jamfile
@@ -4,6 +4,10 @@ fakelib builder : [ glob *.cc : *test.cc *main.cc ]
exe lmplz : lmplz_main.cc builder /top//boost_program_options ;
+exe dump_counts : dump_counts_main.cc builder ;
+
+alias programs : lmplz dump_counts ;
+
import testing ;
unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ;
unit-test adjust_counts_test : adjust_counts_test.cc builder /top//boost_unit_test_framework ;
diff --git a/lm/builder/adjust_counts.cc b/lm/builder/adjust_counts.cc
index 080b438a4..803c557d0 100644
--- a/lm/builder/adjust_counts.cc
+++ b/lm/builder/adjust_counts.cc
@@ -29,28 +29,44 @@ class StatCollector {
~StatCollector() {}
- void CalculateDiscounts() {
+ void CalculateDiscounts(const DiscountConfig &config) {
counts_.resize(orders_.size());
counts_pruned_.resize(orders_.size());
- discounts_.resize(orders_.size());
for (std::size_t i = 0; i < orders_.size(); ++i) {
const OrderStat &s = orders_[i];
counts_[i] = s.count;
counts_pruned_[i] = s.count_pruned;
+ }
- for (unsigned j = 1; j < 4; ++j) {
- // TODO: Specialize error message for j == 3, meaning 3+
- UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for "
- << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any "
- << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?");
- }
-
- // See equation (26) in Chen and Goodman.
- discounts_[i].amount[0] = 0.0;
- float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]);
- for (unsigned j = 1; j < 4; ++j) {
- discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]);
- UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]);
+ discounts_ = config.overwrite;
+ discounts_.resize(orders_.size());
+ for (std::size_t i = config.overwrite.size(); i < orders_.size(); ++i) {
+ const OrderStat &s = orders_[i];
+ try {
+ for (unsigned j = 1; j < 4; ++j) {
+ // TODO: Specialize error message for j == 3, meaning 3+
+ UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for "
+ << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any "
+ << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?");
+ }
+
+ // See equation (26) in Chen and Goodman.
+ discounts_[i].amount[0] = 0.0;
+ float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]);
+ for (unsigned j = 1; j < 4; ++j) {
+ discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]);
+ UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]);
+ }
+ } catch (const BadDiscountException &e) {
+ switch (config.bad_action) {
+ case THROW_UP:
+ throw;
+ case COMPLAIN:
+ std::cerr << e.what() << " Substituting fallback discounts D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl;
+ case SILENT:
+ break;
+ }
+ discounts_[i] = config.fallback;
}
}
}
@@ -179,7 +195,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
for (NGramStream full(positions[0]); full; ++full)
stats.AddFull(full->Count());
- stats.CalculateDiscounts();
+ stats.CalculateDiscounts(discount_config_);
return;
}
@@ -262,7 +278,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
for (NGramStream *s = streams.begin(); s != streams.end(); ++s)
s->Poison();
- stats.CalculateDiscounts();
+ stats.CalculateDiscounts(discount_config_);
// NOTE: See special early-return case for unigrams near the top of this function
}
diff --git a/lm/builder/adjust_counts.hh b/lm/builder/adjust_counts.hh
index 60198e8f8..a5435c282 100644
--- a/lm/builder/adjust_counts.hh
+++ b/lm/builder/adjust_counts.hh
@@ -2,6 +2,7 @@
#define LM_BUILDER_ADJUST_COUNTS_H
#include "lm/builder/discount.hh"
+#include "lm/lm_exception.hh"
#include "util/exception.hh"
#include <vector>
@@ -19,6 +20,16 @@ class BadDiscountException : public util::Exception {
~BadDiscountException() throw();
};
+struct DiscountConfig {
+  // Overrides discounts for orders [1,overwrite.size()].
+ std::vector<Discount> overwrite;
+ // If discounting fails for an order, copy them from here.
+ Discount fallback;
+  // What to do when discounts are out of range or would trigger division by
+  // zero. If it does something other than THROW_UP, fallback is used.
+ WarningAction bad_action;
+};
+
/* Compute adjusted counts.
* Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
* Output: [1,N]-grams with adjusted counts.
@@ -27,17 +38,28 @@ class BadDiscountException : public util::Exception {
*/
class AdjustCounts {
public:
- AdjustCounts(std::vector<uint64_t> &counts, std::vector<uint64_t> &counts_pruned, std::vector<Discount> &discounts, std::vector<uint64_t> &prune_thresholds)
- : counts_(counts), counts_pruned_(counts_pruned), discounts_(discounts), prune_thresholds_(prune_thresholds)
+ // counts: output
+ // counts_pruned: output
+ // discounts: mostly output. If the input already has entries, they will be kept.
+ // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned.
+ AdjustCounts(
+ const std::vector<uint64_t> &prune_thresholds,
+ std::vector<uint64_t> &counts,
+ std::vector<uint64_t> &counts_pruned,
+ const DiscountConfig &discount_config,
+ std::vector<Discount> &discounts)
+ : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), discount_config_(discount_config), discounts_(discounts)
{}
void Run(const util::stream::ChainPositions &positions);
private:
+ const std::vector<uint64_t> &prune_thresholds_;
std::vector<uint64_t> &counts_;
std::vector<uint64_t> &counts_pruned_;
+
+ DiscountConfig discount_config_;
std::vector<Discount> &discounts_;
- std::vector<uint64_t> &prune_thresholds_;
};
} // namespace builder
diff --git a/lm/builder/adjust_counts_test.cc b/lm/builder/adjust_counts_test.cc
index 9d8ef65b6..073c5dfeb 100644
--- a/lm/builder/adjust_counts_test.cc
+++ b/lm/builder/adjust_counts_test.cc
@@ -75,7 +75,10 @@ BOOST_AUTO_TEST_CASE(Simple) {
chains >> util::stream::kRecycle;
std::vector<uint64_t> counts_pruned(4);
std::vector<uint64_t> prune_thresholds(4);
- BOOST_CHECK_THROW(AdjustCounts(counts, counts_pruned, discount, prune_thresholds).Run(for_adjust), BadDiscountException);
+ DiscountConfig discount_config;
+ discount_config.fallback = Discount();
+ discount_config.bad_action = THROW_UP;
+ BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, discount_config, discount).Run(for_adjust), BadDiscountException);
}
BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]);
diff --git a/lm/builder/dump_counts_main.cc b/lm/builder/dump_counts_main.cc
new file mode 100644
index 000000000..fa0016792
--- /dev/null
+++ b/lm/builder/dump_counts_main.cc
@@ -0,0 +1,36 @@
+#include "lm/builder/print.hh"
+#include "lm/word_index.hh"
+#include "util/file.hh"
+#include "util/read_compressed.hh"
+
+#include <boost/lexical_cast.hpp>
+
+#include <iostream>
+#include <vector>
+
+int main(int argc, char *argv[]) {
+ if (argc != 4) {
+ std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
+ "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
+ "counts. Each record has order many vocabulary ids.\n"
+ "The vocabulary file contains the words delimited by NULL in order of id.\n"
+ "The vocabulary file may not be compressed because it is mmapped but the counts\n"
+ "file can be compressed.\n";
+ return 1;
+ }
+ util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
+ util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
+ lm::builder::VocabReconstitute vocab(vocab_file.get());
+ unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
+ std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
+ while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
+ UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
+ const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
+ for (const lm::WordIndex *i = words; i != words + order; ++i) {
+ UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
+ std::cout << vocab.Lookup(*i) << ' ';
+ }
+ // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FakeOFStream.
+ std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
+ }
+}
diff --git a/lm/builder/initial_probabilities.cc b/lm/builder/initial_probabilities.cc
index f6ee334c7..5d19a8973 100644
--- a/lm/builder/initial_probabilities.cc
+++ b/lm/builder/initial_probabilities.cc
@@ -69,9 +69,12 @@ class PruneNGramStream {
block_->SetValidSize(dest_.Base() - block_base);
++block_;
StartBlock();
+ if (block_) {
+ currentCount_ = current_.CutoffCount();
+ }
+ } else {
+ currentCount_ = current_.CutoffCount();
}
-
- currentCount_ = current_.CutoffCount();
return *this;
}
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index db8537448..a7947a422 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -9,14 +9,66 @@
#include "util/murmur_hash.hh"
#include <assert.h>
+#include <math.h>
namespace lm { namespace builder {
namespace {
-class Callback {
+/* Calculate q, the collapsed probability and backoff, as defined in
+ * @inproceedings{Heafield-rest,
+ * author = {Kenneth Heafield and Philipp Koehn and Alon Lavie},
+ * title = {Language Model Rest Costs and Space-Efficient Storage},
+ * year = {2012},
+ * month = {July},
+ * booktitle = {Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},
+ * address = {Jeju Island, Korea},
+ * pages = {1169--1178},
+ * url = {http://kheafield.com/professional/edinburgh/rest\_paper.pdf},
+ * }
+ * This is particularly convenient to calculate during interpolation because
+ * the needed backoff terms are already accessed at the same time.
+ */
+class OutputQ {
+ public:
+ explicit OutputQ(std::size_t order) : q_delta_(order) {}
+
+ void Gram(unsigned order_minus_1, float full_backoff, ProbBackoff &out) {
+ float &q_del = q_delta_[order_minus_1];
+ if (order_minus_1) {
+ // Divide by context's backoff (which comes in as out.backoff)
+ q_del = q_delta_[order_minus_1 - 1] / out.backoff * full_backoff;
+ } else {
+ q_del = full_backoff;
+ }
+ out.prob = log10f(out.prob * q_del);
+ // TODO: stop wastefully outputting this!
+ out.backoff = 0.0;
+ }
+
+ private:
+ // Product of backoffs in the numerator divided by backoffs in the
+ // denominator. Does not include
+ std::vector<float> q_delta_;
+};
+
+/* Default: output probability and backoff */
+class OutputProbBackoff {
+ public:
+ explicit OutputProbBackoff(std::size_t /*order*/) {}
+
+ void Gram(unsigned /*order_minus_1*/, float full_backoff, ProbBackoff &out) const {
+ // Correcting for numerical precision issues. Take that IRST.
+ out.prob = std::min(0.0f, log10f(out.prob));
+ out.backoff = log10f(full_backoff);
+ }
+};
+
+template <class Output> class Callback {
public:
Callback(float uniform_prob, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds)
- : backoffs_(backoffs.size()), probs_(backoffs.size() + 2), prune_thresholds_(prune_thresholds) {
+ : backoffs_(backoffs.size()), probs_(backoffs.size() + 2),
+ prune_thresholds_(prune_thresholds),
+ output_(backoffs.size() + 1 /* order */) {
probs_[0] = uniform_prob;
for (std::size_t i = 0; i < backoffs.size(); ++i) {
backoffs_.push_back(backoffs[i]);
@@ -25,6 +77,10 @@ class Callback {
~Callback() {
for (std::size_t i = 0; i < backoffs_.size(); ++i) {
+ if(prune_thresholds_[i + 1] > 0)
+ while(backoffs_[i])
+ ++backoffs_[i];
+
if (backoffs_[i]) {
std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl;
abort();
@@ -36,40 +92,34 @@ class Callback {
Payload &pay = gram.Value();
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
probs_[order_minus_1 + 1] = pay.complete.prob;
- pay.complete.prob = log10(pay.complete.prob);
-
- if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
- // This skips over ngrams if backoffs have been exhausted.
- if(!backoffs_[order_minus_1]) {
- pay.complete.backoff = 0.0;
- return;
- }
+ float out_backoff;
+ if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
if(prune_thresholds_[order_minus_1 + 1] > 0) {
//Compute hash value for current context
uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
- while(backoffs_[order_minus_1] && current_hash != hashed_backoff->hash_value) {
+ while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1])
hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
- ++backoffs_[order_minus_1];
- }
-
+
if(current_hash == hashed_backoff->hash_value) {
- pay.complete.backoff = log10(hashed_backoff->gamma);
+ out_backoff = hashed_backoff->gamma;
++backoffs_[order_minus_1];
} else {
// Has been pruned away so it is not a context anymore
- pay.complete.backoff = 0.0;
+ out_backoff = 1.0;
}
} else {
- pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
+ out_backoff = *static_cast<const float*>(backoffs_[order_minus_1].Get());
++backoffs_[order_minus_1];
}
} else {
// Not a context.
- pay.complete.backoff = 0.0;
+ out_backoff = 1.0;
}
+
+ output_.Gram(order_minus_1, out_backoff, pay.complete);
}
void Exit(unsigned, const NGram &) const {}
@@ -79,19 +129,29 @@ class Callback {
std::vector<float> probs_;
const std::vector<uint64_t>& prune_thresholds_;
+
+ Output output_;
};
} // namespace
-Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t>& prune_thresholds)
+Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t>& prune_thresholds, bool output_q)
: uniform_prob_(1.0 / static_cast<float>(vocab_size)), // Includes <unk> but excludes <s>.
backoffs_(backoffs),
- prune_thresholds_(prune_thresholds) {}
+ prune_thresholds_(prune_thresholds),
+ output_q_(output_q) {}
// perform order-wise interpolation
void Interpolate::Run(const util::stream::ChainPositions &positions) {
assert(positions.size() == backoffs_.size() + 1);
- Callback callback(uniform_prob_, backoffs_, prune_thresholds_);
- JointOrder<Callback, SuffixOrder>(positions, callback);
+ if (output_q_) {
+ typedef Callback<OutputQ> C;
+ C callback(uniform_prob_, backoffs_, prune_thresholds_);
+ JointOrder<C, SuffixOrder>(positions, callback);
+ } else {
+ typedef Callback<OutputProbBackoff> C;
+ C callback(uniform_prob_, backoffs_, prune_thresholds_);
+ JointOrder<C, SuffixOrder>(positions, callback);
+ }
}
}} // namespaces
diff --git a/lm/builder/interpolate.hh b/lm/builder/interpolate.hh
index 55a55428f..0acece926 100644
--- a/lm/builder/interpolate.hh
+++ b/lm/builder/interpolate.hh
@@ -18,7 +18,7 @@ class Interpolate {
public:
// Normally vocab_size is the unigram count-1 (since p(<s>) = 0) but might
// be larger when the user specifies a consistent vocabulary size.
- explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds);
+ explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool output_q_);
void Run(const util::stream::ChainPositions &positions);
@@ -26,6 +26,7 @@ class Interpolate {
float uniform_prob_;
util::stream::ChainPositions backoffs_;
const std::vector<uint64_t> prune_thresholds_;
+ bool output_q_;
};
}} // namespaces
diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc
index e1ae2d417..265dd2164 100644
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@@ -33,7 +33,6 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
// convert to vector of integers
std::vector<uint64_t> prune_thresholds;
prune_thresholds.reserve(order);
- std::cerr << "Pruning ";
for (std::vector<std::string>::const_iterator it(param.begin()); it != param.end(); ++it) {
try {
prune_thresholds.push_back(boost::lexical_cast<uint64_t>(*it));
@@ -66,6 +65,18 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
return prune_thresholds;
}
+lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) {
+ lm::builder::Discount ret;
+ UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+");
+ UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified");
+ ret.amount[0] = 0.0;
+ for (unsigned i = 0; i < 3; ++i) {
+ float discount = boost::lexical_cast<float>(param[i < param.size() ? i : (param.size() - 1)]);
+ UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
+ ret.amount[i + 1] = discount;
+ }
+ return ret;
+}
} // namespace
@@ -77,7 +88,11 @@ int main(int argc, char *argv[]) {
std::string text, arpa;
std::vector<std::string> pruning;
-
+ std::vector<std::string> discount_fallback;
+ std::vector<std::string> discount_fallback_default;
+ discount_fallback_default.push_back("0.5");
+ discount_fallback_default.push_back("1");
+ discount_fallback_default.push_back("1.5");
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
@@ -86,7 +101,7 @@ int main(int argc, char *argv[]) {
->required()
#endif
, "Order of the model")
- ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)")
+ ("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
@@ -99,7 +114,9 @@ int main(int argc, char *argv[]) {
("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
- ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Unigram pruning is not implemented, so the first value must be zero. Default is to not prune, which is equivalent to --prune 0.");
+ ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
+ ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Unigram pruning is not implemented, so the first value must be zero. Default is to not prune, which is equivalent to --prune 0.")
+ ("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, options), vm);
@@ -143,7 +160,7 @@ int main(int argc, char *argv[]) {
#endif
if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
- std::cerr << "--vocab_pad requires --interpolate_unigrams" << std::endl;
+ std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl;
return 1;
}
@@ -153,6 +170,15 @@ int main(int argc, char *argv[]) {
pipeline.disallowed_symbol_action = lm::THROW_UP;
}
+ if (vm.count("discount_fallback")) {
+ pipeline.discount.fallback = ParseDiscountFallback(discount_fallback);
+ pipeline.discount.bad_action = lm::COMPLAIN;
+ } else {
+ // Unused, just here to prevent the compiler from complaining about uninitialized.
+ pipeline.discount.fallback = lm::builder::Discount();
+ pipeline.discount.bad_action = lm::THROW_UP;
+ }
+
// parse pruning thresholds. These depend on order, so it is not done as a notifier.
pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);
diff --git a/lm/builder/pipeline.cc b/lm/builder/pipeline.cc
index da82c22e7..21064ab3a 100644
--- a/lm/builder/pipeline.cc
+++ b/lm/builder/pipeline.cc
@@ -280,7 +280,7 @@ void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &maste
gamma_chains.push_back(read_backoffs);
gamma_chains.back() >> gammas[i].Source();
}
- master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds);
+ master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.output_q);
gamma_chains >> util::stream::kRecycle;
master.BufferFinal(counts);
}
@@ -302,33 +302,40 @@ void Pipeline(PipelineConfig config, int text_file, int out_arpa) {
"Not enough memory to fit " << (config.order * config.block_count) << " blocks with minimum size " << config.minimum_block << ". Increase memory to " << (config.minimum_block * config.order * config.block_count) << " bytes or decrease the minimum block size.");
UTIL_TIMER("(%w s) Total wall time elapsed\n");
- Master master(config);
-
- util::scoped_fd vocab_file(config.vocab_file.empty() ?
- util::MakeTemp(config.TempPrefix()) :
- util::CreateOrThrow(config.vocab_file.c_str()));
- uint64_t token_count;
- std::string text_file_name;
- CountText(text_file, vocab_file.get(), master, token_count, text_file_name);
- std::vector<uint64_t> counts;
- std::vector<uint64_t> counts_pruned;
- std::vector<Discount> discounts;
- master >> AdjustCounts(counts, counts_pruned, discounts, config.prune_thresholds);
+ Master master(config);
+ // master's destructor will wait for chains. But they might be deadlocked if
+ // this thread dies because e.g. it ran out of memory.
+ try {
+ util::scoped_fd vocab_file(config.vocab_file.empty() ?
+ util::MakeTemp(config.TempPrefix()) :
+ util::CreateOrThrow(config.vocab_file.c_str()));
+ uint64_t token_count;
+ std::string text_file_name;
+ CountText(text_file, vocab_file.get(), master, token_count, text_file_name);
+
+ std::vector<uint64_t> counts;
+ std::vector<uint64_t> counts_pruned;
+ std::vector<Discount> discounts;
+ master >> AdjustCounts(config.prune_thresholds, counts, counts_pruned, config.discount, discounts);
+
+ {
+ util::FixedArray<util::stream::FileBuffer> gammas;
+ Sorts<SuffixOrder> primary;
+ InitialProbabilities(counts, counts_pruned, discounts, master, primary, gammas, config.prune_thresholds);
+ InterpolateProbabilities(counts_pruned, master, primary, gammas);
+ }
- {
- util::FixedArray<util::stream::FileBuffer> gammas;
- Sorts<SuffixOrder> primary;
- InitialProbabilities(counts, counts_pruned, discounts, master, primary, gammas, config.prune_thresholds);
- InterpolateProbabilities(counts_pruned, master, primary, gammas);
+ std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl;
+ VocabReconstitute vocab(vocab_file.get());
+ UTIL_THROW_IF(vocab.Size() != counts[0], util::Exception, "Vocab words don't match up. Is there a null byte in the input?");
+ HeaderInfo header_info(text_file_name, token_count);
+ master >> PrintARPA(vocab, counts_pruned, (config.verbose_header ? &header_info : NULL), out_arpa) >> util::stream::kRecycle;
+ master.MutableChains().Wait(true);
+ } catch (const util::Exception &e) {
+ std::cerr << e.what() << std::endl;
+ abort();
}
-
- std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl;
- VocabReconstitute vocab(vocab_file.get());
- UTIL_THROW_IF(vocab.Size() != counts[0], util::Exception, "Vocab words don't match up. Is there a null byte in the input?");
- HeaderInfo header_info(text_file_name, token_count);
- master >> PrintARPA(vocab, counts_pruned, (config.verbose_header ? &header_info : NULL), out_arpa) >> util::stream::kRecycle;
- master.MutableChains().Wait(true);
}
}} // namespaces
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index 4395622ed..09e1a4d52 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -1,6 +1,7 @@
#ifndef LM_BUILDER_PIPELINE_H
#define LM_BUILDER_PIPELINE_H
+#include "lm/builder/adjust_counts.hh"
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/header_info.hh"
#include "lm/lm_exception.hh"
@@ -19,6 +20,8 @@ struct PipelineConfig {
util::stream::SortConfig sort;
InitialProbabilitiesConfig initial_probs;
util::stream::ChainConfig read_backoffs;
+
+ // Include a header in the ARPA with some statistics?
bool verbose_header;
// Estimated vocabulary size. Used for sizing CorpusCount memory and
@@ -34,6 +37,12 @@ struct PipelineConfig {
// n-gram count thresholds for pruning. 0 values means no pruning for
// corresponding n-gram order
std::vector<uint64_t> prune_thresholds; //mjd
+
+ // What to do with discount failures.
+ DiscountConfig discount;
+
+ // Compute collapsed q values instead of probability and backoff
+ bool output_q;
/* Computing the perplexity of LMs with different vocabularies is hard. For
* example, the lowest perplexity is attained by a unigram model that
diff --git a/lm/builder/print.cc b/lm/builder/print.cc
index c70e62ed6..aee6e1341 100644
--- a/lm/builder/print.cc
+++ b/lm/builder/print.cc
@@ -50,13 +50,12 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) {
out << "\\" << order << "-grams:" << '\n';
for (NGramStream stream(positions[order - 1]); stream; ++stream) {
// Correcting for numerical precision issues. Take that IRST.
- out << std::min(0.0f, stream->Value().complete.prob) << '\t' << vocab_.Lookup(*stream->begin());
+ out << stream->Value().complete.prob << '\t' << vocab_.Lookup(*stream->begin());
for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
out << ' ' << vocab_.Lookup(*i);
}
- float backoff = stream->Value().complete.backoff;
- if (backoff != 0.0)
- out << '\t' << backoff;
+ if (order != positions.size())
+ out << '\t' << stream->Value().complete.backoff;
out << '\n';
}
diff --git a/lm/model_test.cc b/lm/model_test.cc
index 7005b05ea..0f54724bb 100644
--- a/lm/model_test.cc
+++ b/lm/model_test.cc
@@ -176,7 +176,7 @@ template <class M> void MinimalState(const M &model) {
AppendTest("to", 1, -1.687872, false);
AppendTest("look", 2, -0.2922095, true);
BOOST_CHECK_EQUAL(2, state.length);
- AppendTest("good", 3, -7, true);
+ AppendTest("a", 3, -7, true);
}
template <class M> void ExtendLeftTest(const M &model) {
diff --git a/lm/ngram_query.hh b/lm/ngram_query.hh
index 9e32d113a..5f330c5cc 100644
--- a/lm/ngram_query.hh
+++ b/lm/ngram_query.hh
@@ -36,7 +36,7 @@ struct FullPrint : public BasicPrint {
"Perplexity including OOVs:\t" << ppl_including_oov << "\n"
"Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
"OOVs:\t" << corpus_oov << "\n"
- "Tokenss:\t" << corpus_tokens << '\n'
+ "Tokens:\t" << corpus_tokens << '\n'
;
}
};
diff --git a/lm/read_arpa.hh b/lm/read_arpa.hh
index 213fe1caa..64eeef306 100644
--- a/lm/read_arpa.hh
+++ b/lm/read_arpa.hh
@@ -41,29 +41,24 @@ class PositiveProbWarn {
WarningAction action_;
};
-template <class Weights> StringPiece Read1Gram(util::FilePiece &f, Weights &weights, PositiveProbWarn &warn) {
+template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
try {
- weights.prob = f.ReadFloat();
- if (weights.prob > 0.0) {
- warn.Warn(weights.prob);
- weights.prob = 0.0;
+ float prob = f.ReadFloat();
+ if (prob > 0.0) {
+ warn.Warn(prob);
+ prob = 0.0;
}
UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
- StringPiece ret(f.ReadDelimited(kARPASpaces));
- ReadBackoff(f, weights);
- return ret;
+ WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces));
+ Weights &w = unigrams[word];
+ w.prob = prob;
+ ReadBackoff(f, w);
} catch(util::Exception &e) {
e << " in the 1-gram at byte " << f.Offset();
throw;
}
}
-template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
- Weights temp;
- WordIndex word = vocab.Insert(Read1Gram(f, temp, warn));
- unigrams[word] = temp;
-}
-
template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
ReadNGramHeader(f, 1);
for (std::size_t i = 0; i < count; ++i) {
@@ -81,7 +76,12 @@ template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePie
weights.prob = 0.0;
}
for (unsigned char i = 0; i < n; ++i, ++indices_out) {
- *indices_out = vocab.Index(f.ReadDelimited(kARPASpaces));
+ StringPiece word(f.ReadDelimited(kARPASpaces));
+ WordIndex index = vocab.Index(word);
+ *indices_out = index;
+ // Check for words mapped to <unk> that are not the string <unk>.
+ UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)),
+ FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
}
ReadBackoff(f, weights);
} catch(util::Exception &e) {
diff --git a/lm/test.arpa b/lm/test.arpa
index ef214eae3..c4d2e6df5 100644
--- a/lm/test.arpa
+++ b/lm/test.arpa
@@ -105,7 +105,7 @@ ngram 5=4
-0.04835128 looking on a -0.4771212
-3 also would consider -7
-6 <unk> however <unk> -12
--7 to look good
+-7 to look a
\4-grams:
-0.009249173 looking on a little -0.4771212
diff --git a/lm/test_nounk.arpa b/lm/test_nounk.arpa
index 060733d98..e38fc8547 100644
--- a/lm/test_nounk.arpa
+++ b/lm/test_nounk.arpa
@@ -101,7 +101,7 @@ ngram 5=4
-0.1892331 little more loin
-0.04835128 looking on a -0.4771212
-3 also would consider -7
--7 to look good
+-7 to look a
\4-grams:
-0.009249173 looking on a little -0.4771212
diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc
index dc24e5b75..c3f468746 100644
--- a/lm/trie_sort.cc
+++ b/lm/trie_sort.cc
@@ -107,14 +107,20 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_pre
}
struct ThrowCombine {
- void operator()(std::size_t /*entry_size*/, const void * /*first*/, const void * /*second*/, FILE * /*out*/) const {
- UTIL_THROW(FormatLoadException, "Duplicate n-gram detected.");
+ void operator()(std::size_t entry_size, unsigned char order, const void *first, const void *second, FILE * /*out*/) const {
+ const WordIndex *base = reinterpret_cast<const WordIndex*>(first);
+ FormatLoadException e;
+ e << "Duplicate n-gram detected with vocab ids";
+ for (const WordIndex *i = base; i != base + order; ++i) {
+ e << ' ' << *i;
+ }
+ throw e;
}
};
// Useful for context files that just contain records with no value.
struct FirstCombine {
- void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const {
+ void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const {
util::WriteOrThrow(out, first, entry_size);
}
};
@@ -134,7 +140,7 @@ template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_f
util::WriteOrThrow(out_file.get(), second.Data(), entry_size);
++second;
} else {
- combine(entry_size, first.Data(), second.Data(), out_file.get());
+ combine(entry_size, order, first.Data(), second.Data(), out_file.get());
++first; ++second;
}
}
diff --git a/lm/wrappers/README b/lm/wrappers/README
new file mode 100644
index 000000000..56c34c23e
--- /dev/null
+++ b/lm/wrappers/README
@@ -0,0 +1,3 @@
+This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's. You will need to have their LM installed.
+
+NPLM is a work in progress.
diff --git a/lm/wrappers/nplm.cc b/lm/wrappers/nplm.cc
new file mode 100644
index 000000000..70622bd2b
--- /dev/null
+++ b/lm/wrappers/nplm.cc
@@ -0,0 +1,90 @@
+#include "lm/wrappers/nplm.hh"
+#include "util/exception.hh"
+#include "util/file.hh"
+
+#include <algorithm>
+
+#include <string.h>
+
+#include "neuralLM.h"
+
+namespace lm {
+namespace np {
+
+Vocabulary::Vocabulary(const nplm::vocabulary &vocab)
+ : base::Vocabulary(vocab.lookup_word("<s>"), vocab.lookup_word("</s>"), vocab.lookup_word("<unk>")),
+ vocab_(vocab), null_word_(vocab.lookup_word("<null>")) {}
+
+Vocabulary::~Vocabulary() {}
+
+WordIndex Vocabulary::Index(const std::string &str) const {
+ return vocab_.lookup_word(str);
+}
+
+bool Model::Recognize(const std::string &name) {
+ try {
+ util::scoped_fd file(util::OpenReadOrThrow(name.c_str()));
+ char magic_check[16];
+ util::ReadOrThrow(file.get(), magic_check, sizeof(magic_check));
+ const char nnlm_magic[] = "\\config\nversion ";
+ return !memcmp(magic_check, nnlm_magic, 16);
+ } catch (const util::Exception &) {
+ return false;
+ }
+}
+
+Model::Model(const std::string &file, std::size_t cache)
+ : base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
+ UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the defintion of NPLM_MAX_ORDER and recompile.");
+ // log10 compatible with backoff models.
+ base_instance_->set_log_base(10.0);
+ State begin_sentence, null_context;
+ std::fill(begin_sentence.words, begin_sentence.words + NPLM_MAX_ORDER - 1, base_instance_->lookup_word("<s>"));
+ null_word_ = base_instance_->lookup_word("<null>");
+ std::fill(null_context.words, null_context.words + NPLM_MAX_ORDER - 1, null_word_);
+
+ Init(begin_sentence, null_context, vocab_, base_instance_->get_order());
+}
+
+Model::~Model() {}
+
+FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const {
+ nplm::neuralLM *lm = backend_.get();
+ if (!lm) {
+ lm = new nplm::neuralLM(*base_instance_);
+ backend_.reset(lm);
+ lm->set_cache(cache_size_);
+ }
+ // State is in natural word order.
+ FullScoreReturn ret;
+ for (int i = 0; i < lm->get_order() - 1; ++i) {
+ lm->staging_ngram()(i) = from.words[i];
+ }
+ lm->staging_ngram()(lm->get_order() - 1) = new_word;
+ ret.prob = lm->lookup_from_staging();
+ // Always say full order.
+ ret.ngram_length = lm->get_order();
+ // Shift everything down by one.
+ memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2));
+ out_state.words[lm->get_order() - 2] = new_word;
+ // Fill in trailing words with zeros so state comparison works.
+ memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order()));
+ return ret;
+}
+
+// TODO: optimize with direct call?
+FullScoreReturn Model::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const {
+ // State is in natural word order. The API here specifies reverse order.
+ std::size_t state_length = std::min<std::size_t>(Order() - 1, context_rend - context_rbegin);
+ State state;
+ // Pad with null words.
+ for (lm::WordIndex *i = state.words; i < state.words + Order() - 1 - state_length; ++i) {
+ *i = null_word_;
+ }
+ // Put new words at the end.
+ std::reverse_copy(context_rbegin, context_rbegin + state_length, state.words + Order() - 1 - state_length);
+ return FullScore(state, new_word, out_state);
+}
+
+} // namespace np
+} // namespace lm
diff --git a/lm/wrappers/nplm.hh b/lm/wrappers/nplm.hh
new file mode 100644
index 000000000..b7dd4a21e
--- /dev/null
+++ b/lm/wrappers/nplm.hh
@@ -0,0 +1,83 @@
+#ifndef LM_WRAPPERS_NPLM_H
+#define LM_WRAPPERS_NPLM_H
+
+#include "lm/facade.hh"
+#include "lm/max_order.hh"
+#include "util/string_piece.hh"
+
+#include <boost/thread/tss.hpp>
+#include <boost/scoped_ptr.hpp>
+
+/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang
+ * and Victoria Fossum."
+ * http://nlg.isi.edu/software/nplm/
+ */
+
+namespace nplm {
+class vocabulary;
+class neuralLM;
+} // namespace nplm
+
+namespace lm {
+namespace np {
+
+class Vocabulary : public base::Vocabulary {
+ public:
+ Vocabulary(const nplm::vocabulary &vocab);
+
+ ~Vocabulary();
+
+ WordIndex Index(const std::string &str) const;
+
+ // TODO: lobby them to support StringPiece
+ WordIndex Index(const StringPiece &str) const {
+ return Index(std::string(str.data(), str.size()));
+ }
+
+ lm::WordIndex NullWord() const { return null_word_; }
+
+ private:
+ const nplm::vocabulary &vocab_;
+
+ const lm::WordIndex null_word_;
+};
+
+// Sorry for imposing my limitations on your code.
+#define NPLM_MAX_ORDER 7
+
+struct State {
+ WordIndex words[NPLM_MAX_ORDER - 1];
+};
+
+class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
+ private:
+ typedef lm::base::ModelFacade<Model, State, Vocabulary> P;
+
+ public:
+ // Does this look like an NPLM?
+ static bool Recognize(const std::string &file);
+
+ explicit Model(const std::string &file, std::size_t cache_size = 1 << 20);
+
+ ~Model();
+
+ FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const;
+
+ FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
+
+ private:
+ boost::scoped_ptr<nplm::neuralLM> base_instance_;
+
+ mutable boost::thread_specific_ptr<nplm::neuralLM> backend_;
+
+ Vocabulary vocab_;
+
+ lm::WordIndex null_word_;
+
+ const std::size_t cache_size_;
+};
+
+} // namespace np
+} // namespace lm
+
+#endif // LM_WRAPPERS_NPLM_H
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index 467855d9b..04f4d75c9 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -186,7 +186,7 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
entry.set(stats);
}
-statscore_t BleuScorer::calculateScore(const vector<int>& comps) const
+statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
@@ -266,12 +266,12 @@ float smoothedSentenceBleu
float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg)
{
// Sum sent and background
- std::vector<float> stats;
UTIL_THROW_IF(sent.size()!=bg.size(), util::Exception, "Error");
UTIL_THROW_IF(sent.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
+ std::vector<float> stats(sent.size());
for(size_t i=0; i<sent.size(); i++)
- stats.push_back(sent[i]+bg[i]);
+ stats[i] = sent[i]+bg[i];
// Calculate BLEU
float logbleu = 0.0;
@@ -289,23 +289,6 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
return exp(logbleu) * stats[kBleuNgramOrder*2];
}
-float unsmoothedBleu(const std::vector<float>& stats)
-{
- UTIL_THROW_IF(stats.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
-
- float logbleu = 0.0;
- for (int j = 0; j < kBleuNgramOrder; j++) {
- logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
- }
- logbleu /= kBleuNgramOrder;
- const float brevity = 1.0 - stats[(kBleuNgramOrder * 2)] / stats[1];
-
- if (brevity < 0.0) {
- logbleu += brevity;
- }
- return exp(logbleu);
-}
-
vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile)
{
vector<string> scoreFiles;
diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h
index 8be567574..0594c8dca 100644
--- a/mert/BleuScorer.h
+++ b/mert/BleuScorer.h
@@ -13,7 +13,7 @@
namespace MosesTuning
{
-const int kBleuNgramOrder = 4;
+const size_t kBleuNgramOrder = 4;
class NgramCounts;
class Reference;
@@ -37,7 +37,7 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
- virtual statscore_t calculateScore(const std::vector<int>& comps) const;
+ virtual statscore_t calculateScore(const std::vector<ScoreStatsType>& comps) const;
virtual std::size_t NumberOfScores() const {
return 2 * kBleuNgramOrder + 1;
}
@@ -55,6 +55,10 @@ public:
return m_references.get();
}
+ virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const {
+ return totals[kBleuNgramOrder*2];
+ }
+
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
@@ -93,11 +97,6 @@ float smoothedSentenceBleu
*/
float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg);
-/**
- * Computes plain old BLEU from a vector of stats
- */
-float unsmoothedBleu(const std::vector<float>& stats);
-
}
#endif // MERT_BLEU_SCORER_H_
diff --git a/mert/BleuScorerTest.cpp b/mert/BleuScorerTest.cpp
index a63196a3b..e223c25b1 100644
--- a/mert/BleuScorerTest.cpp
+++ b/mert/BleuScorerTest.cpp
@@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts)
BOOST_AUTO_TEST_CASE(calculate_actual_score)
{
BOOST_REQUIRE(4 == kBleuNgramOrder);
- std::vector<int> stats(2 * kBleuNgramOrder + 1);
+ std::vector<ScoreStatsType> stats(2 * kBleuNgramOrder + 1);
BleuScorer scorer;
// unigram
diff --git a/mert/CderScorer.cpp b/mert/CderScorer.cpp
index f06700abf..4479e0ad8 100644
--- a/mert/CderScorer.cpp
+++ b/mert/CderScorer.cpp
@@ -52,18 +52,18 @@ void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
string sentence = this->preprocessSentence(text);
- vector<int> stats;
+ vector<ScoreStatsType> stats;
prepareStatsVector(sid, sentence, stats);
entry.set(stats);
}
-void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>& stats)
+void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<ScoreStatsType>& stats)
{
sent_t cand;
TokenizeAndEncode(text, cand);
float max = -2;
- vector<int> tmp;
+ vector<ScoreStatsType> tmp;
for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
const sent_t& ref = m_ref_sentences[rid][sid];
tmp.clear();
@@ -79,7 +79,7 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
}
}
-float CderScorer::calculateScore(const vector<int>& comps) const
+float CderScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
if (comps.size() != 2) {
throw runtime_error("Size of stat vector for CDER is not 2");
@@ -89,7 +89,7 @@ float CderScorer::calculateScore(const vector<int>& comps) const
}
void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
- vector<int>& stats) const
+ vector<ScoreStatsType>& stats) const
{
int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
int L = ref.size() + 1; // Number of inter-words positions in reference sentence
diff --git a/mert/CderScorer.h b/mert/CderScorer.h
index bd43ec0d8..68fa81857 100644
--- a/mert/CderScorer.h
+++ b/mert/CderScorer.h
@@ -23,13 +23,13 @@ public:
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
- virtual void prepareStatsVector(std::size_t sid, const std::string& text, std::vector<int>& stats);
+ virtual void prepareStatsVector(std::size_t sid, const std::string& text, std::vector<ScoreStatsType>& stats);
virtual std::size_t NumberOfScores() const {
return 2;
}
- virtual float calculateScore(const std::vector<int>& comps) const;
+ virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const;
private:
bool m_allowed_long_jumps;
@@ -38,7 +38,7 @@ private:
std::vector<std::vector<sent_t> > m_ref_sentences;
void computeCD(const sent_t& cand, const sent_t& ref,
- std::vector<int>& stats) const;
+ std::vector<ScoreStatsType>& stats) const;
// no copying allowed
CderScorer(const CderScorer&);
diff --git a/mert/Data.cpp b/mert/Data.cpp
index b93c3b6c0..49c1239e5 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -135,7 +135,7 @@ void Data::load(const std::string &featfile, const std::string &scorefile)
m_score_data->load(scorefile);
}
-void Data::loadNBest(const string &file)
+void Data::loadNBest(const string &file, bool oneBest)
{
TRACE_ERR("loading nbest from " << file << endl);
util::FilePiece in(file.c_str());
@@ -154,6 +154,7 @@ void Data::loadNBest(const string &file)
util::TokenIter<util::MultiCharacter> it(line, util::MultiCharacter("|||"));
sentence_index = ParseInt(*it);
+ if (oneBest && m_score_data->exists(sentence_index)) continue;
++it;
sentence = it->as_string();
++it;
@@ -164,10 +165,9 @@ void Data::loadNBest(const string &file)
++it; // skip model score.
if (it) {
- ++it;
alignment = it->as_string(); //fifth field (if present) is either phrase or word alignment
+ ++it;
if (it) {
- ++it;
alignment = it->as_string(); //sixth field (if present) is word alignment
}
}
diff --git a/mert/Data.h b/mert/Data.h
index cd090bad3..8bab23d63 100644
--- a/mert/Data.h
+++ b/mert/Data.h
@@ -67,7 +67,7 @@ public:
m_feature_data->Features(f);
}
- void loadNBest(const std::string &file);
+ void loadNBest(const std::string &file, bool oneBest=false);
void load(const std::string &featfile, const std::string &scorefile);
diff --git a/mert/FeatureData.h b/mert/FeatureData.h
index 2510b3aee..6e79529d4 100644
--- a/mert/FeatureData.h
+++ b/mert/FeatureData.h
@@ -12,6 +12,7 @@
#include <vector>
#include <iostream>
#include <stdexcept>
+#include <boost/lexical_cast.hpp>
#include "FeatureArray.h"
namespace MosesTuning
@@ -103,7 +104,7 @@ public:
inline int getName(std::size_t idx) const {
idx2name::const_iterator i = m_index_to_array_name.find(idx);
if (i != m_index_to_array_name.end())
- throw std::runtime_error("there is no entry at index " + idx);
+ throw std::runtime_error("there is no entry at index " + boost::lexical_cast<std::string>(idx));
return i->second;
}
@@ -116,7 +117,7 @@ public:
throw std::runtime_error("Error: you required an too big index");
std::map<std::size_t, std::string>::const_iterator it = m_index_to_feature_name.find(idx);
if (it == m_index_to_feature_name.end()) {
- throw std::runtime_error("Error: specified id is unknown: " + idx);
+ throw std::runtime_error("Error: specified id is unknown: " + boost::lexical_cast<std::string>(idx));
} else {
return it->second;
}
diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp
index 5a12be70a..a0c6a6ebc 100644
--- a/mert/FeatureStats.cpp
+++ b/mert/FeatureStats.cpp
@@ -14,6 +14,8 @@
#include <boost/functional/hash.hpp>
+#include "util/murmur_hash.hh"
+
#include "Util.h"
using namespace std;
@@ -59,6 +61,11 @@ void SparseVector::set(const string& name, FeatureStatsType value)
m_fvector[id] = value;
}
+void SparseVector::set(size_t id, FeatureStatsType value) {
+ assert(m_id_to_name.size() > id);
+ m_fvector[id] = value;
+}
+
void SparseVector::write(ostream& out, const string& sep) const
{
for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
@@ -91,6 +98,16 @@ void SparseVector::load(const string& file)
}
}
+SparseVector& SparseVector::operator+=(const SparseVector& rhs)
+{
+
+ for (fvector_t::const_iterator i = rhs.m_fvector.begin();
+ i != rhs.m_fvector.end(); ++i) {
+ m_fvector[i->first] = get(i->first) + (i->second);
+ }
+ return *this;
+}
+
SparseVector& SparseVector::operator-=(const SparseVector& rhs)
{
@@ -162,12 +179,18 @@ bool operator==(SparseVector const& item1, SparseVector const& item2)
return item1.m_fvector==item2.m_fvector;
}
+
std::size_t hash_value(SparseVector const& item)
{
- boost::hash<SparseVector::fvector_t> hasher;
- return hasher(item.m_fvector);
+ size_t seed = 0;
+ for (SparseVector::fvector_t::const_iterator i = item.m_fvector.begin(); i != item.m_fvector.end(); ++i) {
+ seed = util::MurmurHashNative(&(i->first), sizeof(i->first), seed);
+ seed = util::MurmurHashNative(&(i->second), sizeof(i->second), seed);
+ }
+ return seed;
}
+
FeatureStats::FeatureStats()
: m_available_size(kAvailableSize), m_entries(0),
m_array(new FeatureStatsType[m_available_size]) {}
@@ -181,8 +204,7 @@ FeatureStats::FeatureStats(const size_t size)
FeatureStats::~FeatureStats()
{
- delete [] m_array;
- m_array = NULL;
+ delete [] m_array;
}
void FeatureStats::Copy(const FeatureStats &stats)
diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h
index a882e7358..f989d9418 100644
--- a/mert/FeatureStats.h
+++ b/mert/FeatureStats.h
@@ -14,6 +14,9 @@
#include <map>
#include <string>
#include <vector>
+
+#include <boost/unordered_map.hpp>
+#include "util/string_piece.hh"
#include "Types.h"
namespace MosesTuning
@@ -31,6 +34,7 @@ public:
FeatureStatsType get(const std::string& name) const;
FeatureStatsType get(std::size_t id) const;
void set(const std::string& name, FeatureStatsType value);
+ void set(size_t id, FeatureStatsType value);
void clear();
void load(const std::string& file);
std::size_t size() const {
@@ -40,6 +44,7 @@ public:
void write(std::ostream& out, const std::string& sep = " ") const;
SparseVector& operator-=(const SparseVector& rhs);
+ SparseVector& operator+=(const SparseVector& rhs);
FeatureStatsType inner_product(const SparseVector& rhs) const;
// Added by cherryc
diff --git a/mert/ForestRescore.cpp b/mert/ForestRescore.cpp
new file mode 100644
index 000000000..0172c6d92
--- /dev/null
+++ b/mert/ForestRescore.cpp
@@ -0,0 +1,432 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include <cmath>
+#include <limits>
+#include <list>
+
+#include <boost/unordered_set.hpp>
+
+#include "util/file_piece.hh"
+#include "util/tokenize_piece.hh"
+
+#include "BleuScorer.h"
+#include "ForestRescore.h"
+
+using namespace std;
+
+namespace MosesTuning {
+
+std::ostream& operator<<(std::ostream& out, const WordVec& wordVec) {
+ out << "[";
+ for (size_t i = 0; i < wordVec.size(); ++i) {
+ out << wordVec[i]->first;
+ if (i+1< wordVec.size()) out << " ";
+ }
+ out << "]";
+ return out;
+}
+
+
+void ReferenceSet::Load(const vector<string>& files, Vocab& vocab) {
+ for (size_t i = 0; i < files.size(); ++i) {
+ util::FilePiece fh(files[i].c_str());
+ size_t sentenceId = 0;
+ while(true) {
+ StringPiece line;
+ try {
+ line = fh.ReadLine();
+ } catch (util::EndOfFileException &e) {
+ break;
+ }
+ AddLine(sentenceId, line, vocab);
+ ++sentenceId;
+ }
+ }
+
+}
+
+void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab) {
+ //cerr << line << endl;
+ NgramCounter ngramCounts;
+ list<WordVec> openNgrams;
+ size_t length = 0;
+ //tokenize & count
+ for (util::TokenIter<util::SingleCharacter, true> j(line, util::SingleCharacter(' ')); j; ++j) {
+ const Vocab::Entry* nextTok = &(vocab.FindOrAdd(*j));
+ ++length;
+ openNgrams.push_front(WordVec());
+ for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
+ k->push_back(nextTok);
+ ++ngramCounts[*k];
+ }
+ if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
+ }
+
+ //merge into overall ngram map
+ for (NgramCounter::const_iterator ni = ngramCounts.begin();
+ ni != ngramCounts.end(); ++ni) {
+ size_t count = ni->second;
+ //cerr << *ni << " " << count << endl;
+ if (ngramCounts_.size() <= sentenceId) ngramCounts_.resize(sentenceId+1);
+ NgramMap::iterator totalsIter = ngramCounts_[sentenceId].find(ni->first);
+ if (totalsIter == ngramCounts_[sentenceId].end()) {
+ ngramCounts_[sentenceId][ni->first] = pair<size_t,size_t>(count,count);
+ } else {
+ ngramCounts_[sentenceId][ni->first].first = max(count, ngramCounts_[sentenceId][ni->first].first); //clip
+ ngramCounts_[sentenceId][ni->first].second += count; //no clip
+ }
+ }
+ //length
+ if (lengths_.size() <= sentenceId) lengths_.resize(sentenceId+1);
+ //TODO - length strategy - this is MIN
+ if (!lengths_[sentenceId]) {
+ lengths_[sentenceId] = length;
+ } else {
+ lengths_[sentenceId] = min(length,lengths_[sentenceId]);
+ }
+ //cerr << endl;
+
+}
+
+size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool clip) const {
+ const NgramMap& ngramCounts = ngramCounts_.at(sentenceId);
+ NgramMap::const_iterator ngi = ngramCounts.find(ngram);
+ if (ngi == ngramCounts.end()) return 0;
+ return clip ? ngi->second.first : ngi->second.second;
+}
+
+VertexState::VertexState(): bleuStats(kBleuNgramOrder), targetLength(0) {}
+
+void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStatsType>& bleuStats ) const {
+ for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) {
+ //cerr << "Checking: " << *ngi << " matches " << references_.NgramMatches(sentenceId_,*ngi,false) << endl;
+ size_t order = ngi->first.size();
+ size_t count = ngi->second;
+ bleuStats[(order-1)*2 + 1] += count;
+ bleuStats[(order-1) * 2] += min(count, references_.NgramMatches(sentenceId_,ngi->first,false));
+ }
+}
+
+size_t HgBleuScorer::GetTargetLength(const Edge& edge) const {
+ size_t targetLength = 0;
+ for (size_t i = 0; i < edge.Words().size(); ++i) {
+ const Vocab::Entry* word = edge.Words()[i];
+ if (word) ++targetLength;
+ }
+ for (size_t i = 0; i < edge.Children().size(); ++i) {
+ const VertexState& state = vertexStates_[edge.Children()[i]];
+ targetLength += state.targetLength;
+ }
+ return targetLength;
+}
+
+FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vector<FeatureStatsType>& bleuStats) {
+ NgramCounter ngramCounts;
+ size_t childId = 0;
+ size_t wordId = 0;
+ size_t contextId = 0; //position within left or right context
+ const VertexState* vertexState = NULL;
+ bool inLeftContext = false;
+ bool inRightContext = false;
+ list<WordVec> openNgrams;
+ const Vocab::Entry* currentWord = NULL;
+ while (wordId < edge.Words().size()) {
+ currentWord = edge.Words()[wordId];
+ if (currentWord != NULL) {
+ ++wordId;
+ } else {
+ if (!inLeftContext && !inRightContext) {
+ //entering a vertex
+ assert(!vertexState);
+ vertexState = &(vertexStates_[edge.Children()[childId]]);
+ ++childId;
+ if (vertexState->leftContext.size()) {
+ inLeftContext = true;
+ contextId = 0;
+ currentWord = vertexState->leftContext[contextId];
+ } else {
+ //empty context
+ vertexState = NULL;
+ ++wordId;
+ continue;
+ }
+ } else {
+ //already in a vertex
+ ++contextId;
+ if (inLeftContext && contextId < vertexState->leftContext.size()) {
+ //still in left context
+ currentWord = vertexState->leftContext[contextId];
+ } else if (inLeftContext) {
+ //at end of left context
+ if (vertexState->leftContext.size() == kBleuNgramOrder-1) {
+ //full size context, jump to right state
+ openNgrams.clear();
+ inLeftContext = false;
+ inRightContext = true;
+ contextId = 0;
+ currentWord = vertexState->rightContext[contextId];
+ } else {
+ //short context, just ignore right context
+ inLeftContext = false;
+ vertexState = NULL;
+ ++wordId;
+ continue;
+ }
+ } else {
+ //in right context
+ if (contextId < vertexState->rightContext.size()) {
+ currentWord = vertexState->rightContext[contextId];
+ } else {
+ //leaving vertex
+ inRightContext = false;
+ vertexState = NULL;
+ ++wordId;
+ continue;
+ }
+ }
+ }
+ }
+ assert(currentWord);
+ if (graph_.IsBoundary(currentWord)) continue;
+ openNgrams.push_front(WordVec());
+ openNgrams.front().reserve(kBleuNgramOrder);
+ for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
+ k->push_back(currentWord);
+ //Only insert ngrams that cross boundaries
+ if (!vertexState || (inLeftContext && k->size() > contextId+1)) ++ngramCounts[*k];
+ }
+ if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
+ }
+
+ //Collect matches
+ //This edge
+ //cerr << "edge ngrams" << endl;
+ UpdateMatches(ngramCounts, bleuStats);
+
+ //Child vertexes
+ for (size_t i = 0; i < edge.Children().size(); ++i) {
+ //cerr << "vertex ngrams " << edge.Children()[i] << endl;
+ for (size_t j = 0; j < bleuStats.size(); ++j) {
+ bleuStats[j] += vertexStates_[edge.Children()[i]].bleuStats[j];
+ }
+ }
+
+
+ FeatureStatsType sourceLength = head.SourceCovered();
+ size_t referenceLength = references_.Length(sentenceId_);
+ FeatureStatsType effectiveReferenceLength =
+ sourceLength / totalSourceLength_ * referenceLength;
+
+ bleuStats[bleuStats.size()-1] = effectiveReferenceLength;
+ //backgroundBleu_[backgroundBleu_.size()-1] =
+ // backgroundRefLength_ * sourceLength / totalSourceLength_;
+ FeatureStatsType bleu = sentenceLevelBackgroundBleu(bleuStats, backgroundBleu_);
+
+ return bleu;
+}
+
+void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const vector<FeatureStatsType>& bleuStats) {
+ //TODO: Maybe more efficient to absorb into the Score() method
+ VertexState& vertexState = vertexStates_[vertexId];
+ //cerr << "Updating state for " << vertexId << endl;
+
+ //leftContext
+ int wi = 0;
+ const VertexState* childState = NULL;
+ int contexti = 0; //index within child context
+ int childi = 0;
+ while (vertexState.leftContext.size() < (kBleuNgramOrder-1)) {
+ if ((size_t)wi >= winnerEdge.Words().size()) break;
+ const Vocab::Entry* word = winnerEdge.Words()[wi];
+ if (word != NULL) {
+ vertexState.leftContext.push_back(word);
+ ++wi;
+ } else {
+ if (childState == NULL) {
+ //start of child state
+ childState = &(vertexStates_[winnerEdge.Children()[childi++]]);
+ contexti = 0;
+ }
+ if ((size_t)contexti < childState->leftContext.size()) {
+ vertexState.leftContext.push_back(childState->leftContext[contexti++]);
+ } else {
+ //end of child context
+ childState = NULL;
+ ++wi;
+ }
+ }
+ }
+
+ //rightContext
+ wi = winnerEdge.Words().size() - 1;
+ childState = NULL;
+ childi = winnerEdge.Children().size() - 1;
+ while (vertexState.rightContext.size() < (kBleuNgramOrder-1)) {
+ if (wi < 0) break;
+ const Vocab::Entry* word = winnerEdge.Words()[wi];
+ if (word != NULL) {
+ vertexState.rightContext.push_back(word);
+ --wi;
+ } else {
+ if (childState == NULL) {
+ //start (ie rhs) of child state
+ childState = &(vertexStates_[winnerEdge.Children()[childi--]]);
+ contexti = childState->rightContext.size()-1;
+ }
+ if (contexti >= 0) {
+ vertexState.rightContext.push_back(childState->rightContext[contexti--]);
+ } else {
+ //end (ie lhs) of child context
+ childState = NULL;
+ --wi;
+ }
+ }
+ }
+ reverse(vertexState.rightContext.begin(), vertexState.rightContext.end());
+
+ //length + counts
+ vertexState.targetLength = GetTargetLength(winnerEdge);
+ vertexState.bleuStats = bleuStats;
+}
+
+
+typedef pair<const Edge*,FeatureStatsType> BackPointer;
+
+
+/**
+ * Recurse through back pointers
+ **/
+static void GetBestHypothesis(size_t vertexId, const Graph& graph, const vector<BackPointer>& bps,
+ HgHypothesis* bestHypo) {
+ //cerr << "Expanding " << vertexId << " Score: " << bps[vertexId].second << endl;
+ //UTIL_THROW_IF(bps[vertexId].second == kMinScore+1, HypergraphException, "Landed at vertex " << vertexId << " which is a dead end");
+ if (!bps[vertexId].first) return;
+ const Edge* prevEdge = bps[vertexId].first;
+ bestHypo->featureVector += *(prevEdge->Features().get());
+ size_t childId = 0;
+ for (size_t i = 0; i < prevEdge->Words().size(); ++i) {
+ if (prevEdge->Words()[i] != NULL) {
+ bestHypo->text.push_back(prevEdge->Words()[i]);
+ } else {
+ size_t childVertexId = prevEdge->Children()[childId++];
+ HgHypothesis childHypo;
+ GetBestHypothesis(childVertexId,graph,bps,&childHypo);
+ bestHypo->text.insert(bestHypo->text.end(), childHypo.text.begin(), childHypo.text.end());
+ bestHypo->featureVector += childHypo.featureVector;
+ }
+ }
+}
+
+void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references , size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo)
+{
+ BackPointer init(NULL,kMinScore);
+ vector<BackPointer> backPointers(graph.VertexSize(),init);
+ HgBleuScorer bleuScorer(references, graph, sentenceId, backgroundBleu);
+ vector<FeatureStatsType> winnerStats(kBleuNgramOrder*2+1);
+ for (size_t vi = 0; vi < graph.VertexSize(); ++vi) {
+ //cerr << "vertex id " << vi << endl;
+ FeatureStatsType winnerScore = kMinScore;
+ const Vertex& vertex = graph.GetVertex(vi);
+ const vector<const Edge*>& incoming = vertex.GetIncoming();
+ if (!incoming.size()) {
+ //UTIL_THROW(HypergraphException, "Vertex " << vi << " has no incoming edges");
+ //If no incoming edges, vertex is a dead end
+ backPointers[vi].first = NULL;
+ backPointers[vi].second = kMinScore/2;
+ } else {
+ //cerr << "\nVertex: " << vi << endl;
+ for (size_t ei = 0; ei < incoming.size(); ++ei) {
+ //cerr << "edge id " << ei << endl;
+ FeatureStatsType incomingScore = incoming[ei]->GetScore(weights);
+ for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) {
+ size_t childId = incoming[ei]->Children()[i];
+ UTIL_THROW_IF(backPointers[childId].second == kMinScore,
+ HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId);
+ incomingScore += backPointers[childId].second;
+ }
+ vector<FeatureStatsType> bleuStats(kBleuNgramOrder*2+1);
+ // cerr << "Score: " << incomingScore << " Bleu: ";
+ // if (incomingScore > nonbleuscore) {nonbleuscore = incomingScore; nonbleuid = ei;}
+ FeatureStatsType totalScore = incomingScore;
+ if (bleuWeight) {
+ FeatureStatsType bleuScore = bleuScorer.Score(*(incoming[ei]), vertex, bleuStats);
+ if (isnan(bleuScore)) {
+ cerr << "WARN: bleu score undefined" << endl;
+ cerr << "\tVertex id : " << vi << endl;
+ cerr << "\tBleu stats : ";
+ for (size_t i = 0; i < bleuStats.size(); ++i) {
+ cerr << bleuStats[i] << ",";
+ }
+ cerr << endl;
+ bleuScore = 0;
+ }
+ //UTIL_THROW_IF(isnan(bleuScore), util::Exception, "Bleu score undefined, smoothing problem?");
+ totalScore += bleuWeight * bleuScore;
+ // cerr << bleuScore << " Total: " << incomingScore << endl << endl;
+ //cerr << "is " << incomingScore << " bs " << bleuScore << endl;
+ }
+ if (totalScore >= winnerScore) {
+ //We only store the feature score (not the bleu score) with the vertex,
+ //since the bleu score is always cumulative, ie from counts for the whole span.
+ winnerScore = totalScore;
+ backPointers[vi].first = incoming[ei];
+ backPointers[vi].second = incomingScore;
+ winnerStats = bleuStats;
+ }
+ }
+ //update with winner
+ //if (bleuWeight) {
+ //TODO: Not sure if we need this when computing max-model solution
+ bleuScorer.UpdateState(*(backPointers[vi].first), vi, winnerStats);
+
+ }
+ }
+
+ //expand back pointers
+ GetBestHypothesis(graph.VertexSize()-1, graph, backPointers, bestHypo);
+
+ //bleu stats and fv
+
+ //Need the actual (clipped) stats
+ //TODO: This repeats code in bleu scorer - factor out
+ bestHypo->bleuStats.resize(kBleuNgramOrder*2+1);
+ NgramCounter counts;
+ list<WordVec> openNgrams;
+ for (size_t i = 0; i < bestHypo->text.size(); ++i) {
+ const Vocab::Entry* entry = bestHypo->text[i];
+ if (graph.IsBoundary(entry)) continue;
+ openNgrams.push_front(WordVec());
+ for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
+ k->push_back(entry);
+ ++counts[*k];
+ }
+ if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
+ }
+ for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) {
+ size_t order = ngi->first.size();
+ size_t count = ngi->second;
+ bestHypo->bleuStats[(order-1)*2 + 1] += count;
+ bestHypo->bleuStats[(order-1) * 2] += min(count, references.NgramMatches(sentenceId,ngi->first,true));
+ }
+ bestHypo->bleuStats[kBleuNgramOrder*2] = references.Length(sentenceId);
+}
+
+
+};
diff --git a/mert/ForestRescore.h b/mert/ForestRescore.h
new file mode 100644
index 000000000..900275b74
--- /dev/null
+++ b/mert/ForestRescore.h
@@ -0,0 +1,120 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+#ifndef MERT_FOREST_RESCORE_H
+#define MERT_FOREST_RESCORE_H
+
+#include <valarray>
+#include <vector>
+
+#include <boost/unordered_set.hpp>
+
+#include "BleuScorer.h"
+#include "Hypergraph.h"
+
+namespace MosesTuning {
+
+std::ostream& operator<<(std::ostream& out, const WordVec& wordVec);
+
+struct NgramHash : public std::unary_function<const WordVec&, std::size_t> {
+ std::size_t operator()(const WordVec& ngram) const {
+ return util::MurmurHashNative(&(ngram[0]), ngram.size() * sizeof(WordVec::value_type));
+ }
+};
+
+struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&, bool> {
+ bool operator()(const WordVec& first, const WordVec& second) const {
+ if (first.size() != second.size()) return false;
+ return memcmp(&(first[0]), &(second[0]), first.size() * sizeof(WordVec::value_type)) == 0;
+ }
+};
+
+typedef boost::unordered_map<WordVec, size_t, NgramHash, NgramEquals> NgramCounter;
+
+
+class ReferenceSet {
+
+
+public:
+
+ void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab);
+
+ void Load(const std::vector<std::string>& files, Vocab& vocab);
+
+ size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const;
+
+ size_t Length(size_t sentenceId) const {return lengths_[sentenceId];}
+
+private:
+ //ngrams to (clipped,unclipped) counts
+ typedef boost::unordered_map<WordVec, std::pair<std::size_t,std::size_t>, NgramHash,NgramEquals> NgramMap;
+ std::vector<NgramMap> ngramCounts_;
+ std::vector<size_t> lengths_;
+
+};
+
+struct VertexState {
+ VertexState();
+
+ std::vector<FeatureStatsType> bleuStats;
+ WordVec leftContext;
+ WordVec rightContext;
+ size_t targetLength;
+};
+
+/**
+ * Used to score a rule (i.e. an edge) when we are applying it.
+**/
+class HgBleuScorer {
+ public:
+ HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu):
+ references_(references), sentenceId_(sentenceId), graph_(graph), backgroundBleu_(backgroundBleu),
+ backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) {
+ vertexStates_.resize(graph.VertexSize());
+ totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered();
+ }
+
+ FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ;
+
+ void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats);
+
+
+ private:
+ const ReferenceSet& references_;
+ std::vector<VertexState> vertexStates_;
+ size_t sentenceId_;
+ size_t totalSourceLength_;
+ const Graph& graph_;
+ std::vector<FeatureStatsType> backgroundBleu_;
+ FeatureStatsType backgroundRefLength_;
+
+ void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const;
+ size_t GetTargetLength(const Edge& edge) const;
+};
+
+struct HgHypothesis {
+ SparseVector featureVector;
+ WordVec text;
+ std::vector<FeatureStatsType> bleuStats;
+};
+
+void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo);
+
+};
+
+#endif
diff --git a/mert/ForestRescoreTest.cpp b/mert/ForestRescoreTest.cpp
new file mode 100644
index 000000000..86975d3a5
--- /dev/null
+++ b/mert/ForestRescoreTest.cpp
@@ -0,0 +1,246 @@
+#include <iostream>
+
+#include "ForestRescore.h"
+
+#define BOOST_TEST_MODULE MertForestRescore
+#include <boost/test/unit_test.hpp>
+
+
+
+using namespace std;
+using namespace MosesTuning;
+
+BOOST_AUTO_TEST_CASE(viterbi_simple_lattice)
+{
+ Vocab vocab;
+ WordVec words;
+ string wordStrings[] =
+ {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g"};
+ for (size_t i = 0; i < 9; ++i) {
+ words.push_back(&(vocab.FindOrAdd((wordStrings[i]))));
+ }
+
+ const string f1 = "foo";
+ const string f2 = "bar";
+ Graph graph(vocab);
+ graph.SetCounts(5,5);
+
+ Edge* e0 = graph.NewEdge();
+ e0->AddWord(words[0]);
+ e0->AddFeature(f1, 2.0);
+
+ Vertex* v0 = graph.NewVertex();
+ v0->AddEdge(e0);
+
+ Edge* e1 = graph.NewEdge();
+ e1->AddWord(NULL);
+ e1->AddChild(0);
+ e1->AddWord(words[2]);
+ e1->AddWord(words[3]);
+ e1->AddFeature(f1, 1.0);
+ e1->AddFeature(f2, 3.0);
+
+ Vertex* v1 = graph.NewVertex();
+ v1->AddEdge(e1);
+
+ Edge* e2 = graph.NewEdge();
+ e2->AddWord(NULL);
+ e2->AddChild(1);
+ e2->AddWord(words[4]);
+ e2->AddWord(words[5]);
+ e2->AddFeature(f2, 2.5);
+
+ Vertex* v2 = graph.NewVertex();
+ v2->AddEdge(e2);
+
+ Edge* e3 = graph.NewEdge();
+ e3->AddWord(NULL);
+ e3->AddChild(2);
+ e3->AddWord(words[6]);
+ e3->AddWord(words[7]);
+ e3->AddWord(words[8]);
+ e3->AddFeature(f1, -1);
+
+ Vertex* v3 = graph.NewVertex();
+ v3->AddEdge(e3);
+
+ Edge* e4 = graph.NewEdge();
+ e4->AddWord(NULL);
+ e4->AddChild(3);
+ e4->AddWord(words[1]);
+ e3->AddFeature(f2, 0.5);
+
+ Vertex* v4 = graph.NewVertex();
+ v4->AddEdge(e4);
+
+ ReferenceSet references;
+ references.AddLine(0, "a b c k e f o", vocab);
+ HgHypothesis modelHypo;
+ vector<FeatureStatsType> bg(kBleuNgramOrder*2+1);
+ SparseVector weights;
+ weights.set(f1,2);
+ weights.set(f2,1);
+ Viterbi(graph, weights, 0, references, 0, bg, &modelHypo);
+ BOOST_CHECK_CLOSE(2.0,modelHypo.featureVector.get(f1), 0.0001);
+ BOOST_CHECK_CLOSE(6.0,modelHypo.featureVector.get(f2), 0.0001);
+
+ BOOST_CHECK_EQUAL(words[0]->first, modelHypo.text[0]->first);
+ BOOST_CHECK_EQUAL(words[2]->first, modelHypo.text[1]->first);
+ BOOST_CHECK_EQUAL(words[3]->first, modelHypo.text[2]->first);
+ BOOST_CHECK_EQUAL(words[4]->first, modelHypo.text[3]->first);
+ BOOST_CHECK_EQUAL(words[5]->first, modelHypo.text[4]->first);
+ BOOST_CHECK_EQUAL(words[6]->first, modelHypo.text[5]->first);
+ BOOST_CHECK_EQUAL(words[7]->first, modelHypo.text[6]->first);
+ BOOST_CHECK_EQUAL(words[8]->first, modelHypo.text[7]->first);
+ BOOST_CHECK_EQUAL(words[1]->first, modelHypo.text[8]->first);
+}
+
+
+
+BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice)
+{
+ Vocab vocab;
+ WordVec words;
+ string wordStrings[] =
+ {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"};
+ for (size_t i = 0; i < 13; ++i) {
+ words.push_back(&(vocab.FindOrAdd((wordStrings[i]))));
+ }
+
+ const string f1 = "foo";
+ const string f2 = "bar";
+ Graph graph(vocab);
+ graph.SetCounts(5,8);
+
+ Edge* e0 = graph.NewEdge();
+ e0->AddWord(words[0]);
+
+ Vertex* v0 = graph.NewVertex();
+ v0->AddEdge(e0);
+
+ Edge* e1 = graph.NewEdge();
+ e1->AddWord(NULL);
+ e1->AddChild(0);
+ e1->AddWord(words[2]);
+ e1->AddWord(words[3]);
+ e1->AddFeature(f1,1);
+ e1->AddFeature(f2,1);
+ Edge* e5 = graph.NewEdge();
+ e5->AddWord(NULL);
+ e5->AddChild(0);
+ e5->AddWord(words[9]);
+ e5->AddWord(words[10]);
+ e5->AddFeature(f1,2);
+ e5->AddFeature(f2,-2);
+
+ Vertex* v1 = graph.NewVertex();
+ v1->AddEdge(e1);
+ v1->AddEdge(e5);
+ v1->SetSourceCovered(1);
+
+ Edge* e2 = graph.NewEdge();
+ e2->AddWord(NULL);
+ e2->AddChild(1);
+ e2->AddWord(words[4]);
+ e2->AddWord(words[5]);
+ e2->AddFeature(f2,3);
+
+ Vertex* v2 = graph.NewVertex();
+ v2->AddEdge(e2);
+ v2->SetSourceCovered(3);
+
+ Edge* e3 = graph.NewEdge();
+ e3->AddWord(NULL);
+ e3->AddChild(2);
+ e3->AddWord(words[6]);
+ e3->AddWord(words[7]);
+ e3->AddWord(words[8]);
+ e3->AddFeature(f1,1);
+ Edge* e6 = graph.NewEdge();
+ e6->AddWord(NULL);
+ e6->AddChild(2);
+ e6->AddWord(words[9]);
+ e6->AddWord(words[12]);
+ e6->AddFeature(f2,1);
+ Edge* e7 = graph.NewEdge();
+ e7->AddWord(NULL);
+ e7->AddChild(1);
+ e7->AddWord(words[11]);
+ e7->AddWord(words[12]);
+ e7->AddFeature(f1,2);
+ e7->AddFeature(f2,3);
+
+ Vertex* v3 = graph.NewVertex();
+ v3->AddEdge(e3);
+ v3->AddEdge(e6);
+ v3->AddEdge(e7);
+ v3->SetSourceCovered(5);
+
+ Edge* e4 = graph.NewEdge();
+ e4->AddWord(NULL);
+ e4->AddChild(3);
+ e4->AddWord(words[1]);
+
+ Vertex* v4 = graph.NewVertex();
+ v4->AddEdge(e4);
+ v4->SetSourceCovered(6);
+
+ /*Paths || foo || bar || s(2,1)
+ ab cd hk || 1 || 5 || 7
+ hi cd hk || 2 || 2 || 6
+ ab jk || 3 || 4 || 10
+ hi jk || 4 || 1 || 9
+ ab cd efg || 2 || 4 || 8
+ hi cd efg || 3 || 1 || 7
+ */
+
+ ReferenceSet references;
+ references.AddLine(0, "a b c d h k", vocab);
+ HgHypothesis modelHypo;
+ vector<FeatureStatsType> bg(kBleuNgramOrder*2+1, 0.1);
+ SparseVector weights;
+ weights.set(f1,2);
+ weights.set(f2,1);
+ Viterbi(graph, weights, 0, references, 0, bg, &modelHypo);
+ BOOST_CHECK_CLOSE(3.0,modelHypo.featureVector.get(f1), 0.0001);
+ BOOST_CHECK_CLOSE(4.0,modelHypo.featureVector.get(f2), 0.0001);
+
+ BOOST_CHECK_EQUAL(6, modelHypo.text.size());
+
+ //expect ab jk
+ BOOST_CHECK_EQUAL(words[0]->first, modelHypo.text[0]->first);
+ BOOST_CHECK_EQUAL(words[2]->first, modelHypo.text[1]->first);
+ BOOST_CHECK_EQUAL(words[3]->first, modelHypo.text[2]->first);
+ BOOST_CHECK_EQUAL(words[11]->first, modelHypo.text[3]->first);
+ BOOST_CHECK_EQUAL(words[12]->first, modelHypo.text[4]->first);
+ BOOST_CHECK_EQUAL(words[1]->first, modelHypo.text[5]->first);
+
+
+ HgHypothesis hopeHypo;
+ Viterbi(graph, weights, 1, references, 0, bg, &hopeHypo);
+ //expect abcdhk
+ BOOST_CHECK_EQUAL(8, hopeHypo.text.size());
+
+ BOOST_CHECK_EQUAL(words[0]->first, hopeHypo.text[0]->first);
+ BOOST_CHECK_EQUAL(words[2]->first, hopeHypo.text[1]->first);
+ BOOST_CHECK_EQUAL(words[3]->first, hopeHypo.text[2]->first);
+ BOOST_CHECK_EQUAL(words[4]->first, hopeHypo.text[3]->first);
+ BOOST_CHECK_EQUAL(words[5]->first, hopeHypo.text[4]->first);
+ BOOST_CHECK_EQUAL(words[9]->first, hopeHypo.text[5]->first);
+ BOOST_CHECK_EQUAL(words[12]->first, hopeHypo.text[6]->first);
+ BOOST_CHECK_EQUAL(words[1]->first, hopeHypo.text[7]->first);
+
+ BOOST_CHECK_EQUAL(kBleuNgramOrder*2+1, hopeHypo.bleuStats.size());
+ BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[0]);
+ BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[1]);
+ BOOST_CHECK_EQUAL(5, hopeHypo.bleuStats[2]);
+ BOOST_CHECK_EQUAL(5, hopeHypo.bleuStats[3]);
+ BOOST_CHECK_EQUAL(4, hopeHypo.bleuStats[4]);
+ BOOST_CHECK_EQUAL(4, hopeHypo.bleuStats[5]);
+ BOOST_CHECK_EQUAL(3, hopeHypo.bleuStats[6]);
+ BOOST_CHECK_EQUAL(3, hopeHypo.bleuStats[7]);
+ BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[8]);
+}
+
+
+
diff --git a/mert/HopeFearDecoder.cpp b/mert/HopeFearDecoder.cpp
new file mode 100644
index 000000000..d54d29936
--- /dev/null
+++ b/mert/HopeFearDecoder.cpp
@@ -0,0 +1,343 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include <algorithm>
+#include <cmath>
+#include <iterator>
+
+#define BOOST_FILESYSTEM_VERSION 3
+#include <boost/filesystem.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "util/exception.hh"
+#include "util/file_piece.hh"
+
+#include "Scorer.h"
+#include "HopeFearDecoder.h"
+
+using namespace std;
+namespace fs = boost::filesystem;
+
+namespace MosesTuning {
+
+static const ValType BLEU_RATIO = 5;
+
+ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv) {
+ vector<ValType> stats(scorer_->NumberOfScores(),0);
+ for(reset(); !finished(); next()) {
+ vector<ValType> sent;
+ MaxModel(wv,&sent);
+ for(size_t i=0; i<sent.size(); i++) {
+ stats[i]+=sent[i];
+ }
+ }
+ return scorer_->calculateScore(stats);
+}
+
+NbestHopeFearDecoder::NbestHopeFearDecoder(
+ const vector<string>& featureFiles,
+ const vector<string>& scoreFiles,
+ bool streaming,
+ bool no_shuffle,
+ bool safe_hope,
+ Scorer* scorer
+ ) : safe_hope_(safe_hope) {
+ scorer_ = scorer;
+ if (streaming) {
+ train_.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles));
+ } else {
+ train_.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle));
+ }
+}
+
+
+void NbestHopeFearDecoder::next() {
+ train_->next();
+}
+
+bool NbestHopeFearDecoder::finished() {
+ return train_->finished();
+}
+
+void NbestHopeFearDecoder::reset() {
+ train_->reset();
+}
+
+void NbestHopeFearDecoder::HopeFear(
+ const std::vector<ValType>& backgroundBleu,
+ const MiraWeightVector& wv,
+ HopeFearData* hopeFear
+ ) {
+
+
+ // Hope / fear decode
+ ValType hope_scale = 1.0;
+ size_t hope_index=0, fear_index=0, model_index=0;
+ ValType hope_score=0, fear_score=0, model_score=0;
+ for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
+ ValType hope_bleu, hope_model;
+ for(size_t i=0; i< train_->cur_size(); i++) {
+ const MiraFeatureVector& vec=train_->featuresAt(i);
+ ValType score = wv.score(vec);
+ ValType bleu = scorer_->calculateSentenceLevelBackgroundScore(train_->scoresAt(i),backgroundBleu);
+ // Hope
+ if(i==0 || (hope_scale*score + bleu) > hope_score) {
+ hope_score = hope_scale*score + bleu;
+ hope_index = i;
+ hope_bleu = bleu;
+ hope_model = score;
+ }
+ // Fear
+ if(i==0 || (score - bleu) > fear_score) {
+ fear_score = score - bleu;
+ fear_index = i;
+ }
+ // Model
+ if(i==0 || score > model_score) {
+ model_score = score;
+ model_index = i;
+ }
+ }
+ // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases
+ // where model score is having far more influence than BLEU
+ hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU
+ if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale)
+ hope_scale = abs(hope_bleu) / abs(hope_model);
+ else break;
+ }
+ hopeFear->modelFeatures = train_->featuresAt(model_index);
+ hopeFear->hopeFeatures = train_->featuresAt(hope_index);
+ hopeFear->fearFeatures = train_->featuresAt(fear_index);
+
+ hopeFear->hopeStats = train_->scoresAt(hope_index);
+ hopeFear->hopeBleu = scorer_->calculateSentenceLevelBackgroundScore(hopeFear->hopeStats, backgroundBleu);
+ const vector<float>& fear_stats = train_->scoresAt(fear_index);
+ hopeFear->fearBleu = scorer_->calculateSentenceLevelBackgroundScore(fear_stats, backgroundBleu);
+
+ hopeFear->modelStats = train_->scoresAt(model_index);
+ hopeFear->hopeFearEqual = (hope_index == fear_index);
+}
+
+void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats) {
+ // Find max model
+ size_t max_index=0;
+ ValType max_score=0;
+ for(size_t i=0; i<train_->cur_size(); i++) {
+ MiraFeatureVector vec(train_->featuresAt(i));
+ ValType score = wv.score(vec);
+ if(i==0 || score > max_score) {
+ max_index = i;
+ max_score = score;
+ }
+ }
+ *stats = train_->scoresAt(max_index);
+}
+
+
+
+HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
+ (
+ const string& hypergraphDir,
+ const vector<string>& referenceFiles,
+ size_t num_dense,
+ bool streaming,
+ bool no_shuffle,
+ bool safe_hope,
+ size_t hg_pruning,
+ const MiraWeightVector& wv,
+ Scorer* scorer
+ ) :
+ num_dense_(num_dense) {
+
+ UTIL_THROW_IF(streaming, util::Exception, "Streaming not currently supported for hypergraphs");
+ UTIL_THROW_IF(!fs::exists(hypergraphDir), HypergraphException, "Directory '" << hypergraphDir << "' does not exist");
+ UTIL_THROW_IF(!referenceFiles.size(), util::Exception, "No reference files supplied");
+ references_.Load(referenceFiles, vocab_);
+
+ SparseVector weights;
+ wv.ToSparse(&weights);
+ scorer_ = scorer;
+
+ static const string kWeights = "weights";
+ fs::directory_iterator dend;
+ size_t fileCount = 0;
+
+ cerr << "Reading hypergraphs" << endl;
+ for (fs::directory_iterator di(hypergraphDir); di != dend; ++di) {
+ const fs::path& hgpath = di->path();
+ if (hgpath.filename() == kWeights) continue;
+ Graph graph(vocab_);
+ size_t id = boost::lexical_cast<size_t>(hgpath.stem().string());
+ util::scoped_fd fd(util::OpenReadOrThrow(hgpath.string().c_str()));
+ //util::FilePiece file(di->path().string().c_str());
+ util::FilePiece file(fd.release());
+ ReadGraph(file,graph);
+
+ //cerr << "ref length " << references_.Length(id) << endl;
+ size_t edgeCount = hg_pruning * references_.Length(id);
+ boost::shared_ptr<Graph> prunedGraph;
+ prunedGraph.reset(new Graph(vocab_));
+ graph.Prune(prunedGraph.get(), weights, edgeCount);
+ graphs_[id] = prunedGraph;
+ //cerr << "Pruning to v=" << graphs_[id]->VertexSize() << " e=" << graphs_[id]->EdgeSize() << endl;
+ ++fileCount;
+ if (fileCount % 10 == 0) cerr << ".";
+ if (fileCount % 400 == 0) cerr << " [count=" << fileCount << "]\n";
+ }
+ cerr << endl << "Done" << endl;
+
+ sentenceIds_.resize(graphs_.size());
+ for (size_t i = 0; i < graphs_.size(); ++i) sentenceIds_[i] = i;
+ if (!no_shuffle) {
+ random_shuffle(sentenceIds_.begin(), sentenceIds_.end());
+ }
+
+}
+
+void HypergraphHopeFearDecoder::reset() {
+ sentenceIdIter_ = sentenceIds_.begin();
+}
+
+void HypergraphHopeFearDecoder::next() {
+ sentenceIdIter_++;
+}
+
+bool HypergraphHopeFearDecoder::finished() {
+ return sentenceIdIter_ == sentenceIds_.end();
+}
+
+void HypergraphHopeFearDecoder::HopeFear(
+ const vector<ValType>& backgroundBleu,
+ const MiraWeightVector& wv,
+ HopeFearData* hopeFear
+ ) {
+ size_t sentenceId = *sentenceIdIter_;
+ SparseVector weights;
+ wv.ToSparse(&weights);
+ const Graph& graph = *(graphs_[sentenceId]);
+
+ ValType hope_scale = 1.0;
+ HgHypothesis hopeHypo, fearHypo, modelHypo;
+ for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
+
+ //hope decode
+ Viterbi(graph, weights, 1, references_, sentenceId, backgroundBleu, &hopeHypo);
+
+ //fear decode
+ Viterbi(graph, weights, -1, references_, sentenceId, backgroundBleu, &fearHypo);
+
+ //Model decode
+ Viterbi(graph, weights, 0, references_, sentenceId, backgroundBleu, &modelHypo);
+
+
+ // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases
+ // where model score is having far more influence than BLEU
+ // hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU
+ // if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale)
+ // hope_scale = abs(hope_bleu) / abs(hope_model);
+ // else break;
+ //TODO: Don't currently get model and bleu so commented this out for now.
+ break;
+ }
+ //modelFeatures, hopeFeatures and fearFeatures
+ hopeFear->modelFeatures = MiraFeatureVector(modelHypo.featureVector, num_dense_);
+ hopeFear->hopeFeatures = MiraFeatureVector(hopeHypo.featureVector, num_dense_);
+ hopeFear->fearFeatures = MiraFeatureVector(fearHypo.featureVector, num_dense_);
+
+ //Need to know which are to be mapped to dense features!
+
+ //Only C++11
+ //hopeFear->modelStats.assign(std::begin(modelHypo.bleuStats), std::end(modelHypo.bleuStats));
+ vector<ValType> fearStats(scorer_->NumberOfScores());
+ hopeFear->hopeStats.reserve(scorer_->NumberOfScores());
+ hopeFear->modelStats.reserve(scorer_->NumberOfScores());
+ for (size_t i = 0; i < fearStats.size(); ++i) {
+ hopeFear->modelStats.push_back(modelHypo.bleuStats[i]);
+ hopeFear->hopeStats.push_back(hopeHypo.bleuStats[i]);
+
+ fearStats[i] = fearHypo.bleuStats[i];
+ }
+ /*
+ cerr << "hope" << endl;;
+ for (size_t i = 0; i < hopeHypo.text.size(); ++i) {
+ cerr << hopeHypo.text[i]->first << " ";
+ }
+ cerr << endl;
+ for (size_t i = 0; i < fearStats.size(); ++i) {
+ cerr << hopeHypo.bleuStats[i] << " ";
+ }
+ cerr << endl;
+ cerr << "fear";
+ for (size_t i = 0; i < fearHypo.text.size(); ++i) {
+ cerr << fearHypo.text[i]->first << " ";
+ }
+ cerr << endl;
+ for (size_t i = 0; i < fearStats.size(); ++i) {
+ cerr << fearHypo.bleuStats[i] << " ";
+ }
+ cerr << endl;
+ cerr << "model";
+ for (size_t i = 0; i < modelHypo.text.size(); ++i) {
+ cerr << modelHypo.text[i]->first << " ";
+ }
+ cerr << endl;
+ for (size_t i = 0; i < fearStats.size(); ++i) {
+ cerr << modelHypo.bleuStats[i] << " ";
+ }
+ cerr << endl;
+ */
+ hopeFear->hopeBleu = sentenceLevelBackgroundBleu(hopeFear->hopeStats, backgroundBleu);
+ hopeFear->fearBleu = sentenceLevelBackgroundBleu(fearStats, backgroundBleu);
+
+ //If the feature vectors and bleu stats are equal, then assume hope and fear are equal
+ hopeFear->hopeFearEqual = true; //(hopeFear->hopeBleu - hopeFear->fearBleu) >= 1e-8;
+ if (hopeFear->hopeFearEqual) {
+ for (size_t i = 0; i < fearStats.size(); ++i) {
+ if (fearStats[i] != hopeFear->hopeStats[i]) {
+ hopeFear->hopeFearEqual = false;
+ break;
+ }
+ }
+ }
+ hopeFear->hopeFearEqual = hopeFear->hopeFearEqual && (hopeFear->fearFeatures == hopeFear->hopeFeatures);
+}
+
+void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValType>* stats) {
+ assert(!finished());
+ HgHypothesis bestHypo;
+ size_t sentenceId = *sentenceIdIter_;
+ SparseVector weights;
+ wv.ToSparse(&weights);
+ vector<ValType> bg(scorer_->NumberOfScores());
+ Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo);
+ stats->resize(bestHypo.bleuStats.size());
+ /*
+ for (size_t i = 0; i < bestHypo.text.size(); ++i) {
+ cerr << bestHypo.text[i]->first << " ";
+ }
+ cerr << endl;
+ */
+ for (size_t i = 0; i < bestHypo.bleuStats.size(); ++i) {
+ (*stats)[i] = bestHypo.bleuStats[i];
+ }
+}
+
+
+
+};
diff --git a/mert/HopeFearDecoder.h b/mert/HopeFearDecoder.h
new file mode 100644
index 000000000..d1881eeb2
--- /dev/null
+++ b/mert/HopeFearDecoder.h
@@ -0,0 +1,160 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+#ifndef MERT_HOPEFEARDECODER_H
+#define MERT_HOPEFEARDECODER_H
+
+#include <vector>
+
+#include <boost/scoped_ptr.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include "ForestRescore.h"
+#include "Hypergraph.h"
+#include "HypPackEnumerator.h"
+#include "MiraFeatureVector.h"
+#include "MiraWeightVector.h"
+
+//
+// Used by batch mira to get the hope, fear and model hypothesis. This wraps
+// the n-best list and lattice/hypergraph implementations
+//
+
+namespace MosesTuning {
+
+class Scorer;
+
+/** To be filled in by the decoder */
+struct HopeFearData {
+ MiraFeatureVector modelFeatures;
+ MiraFeatureVector hopeFeatures;
+ MiraFeatureVector fearFeatures;
+
+ std::vector<float> modelStats;
+ std::vector<float> hopeStats;
+
+ ValType hopeBleu;
+ ValType fearBleu;
+
+ bool hopeFearEqual;
+};
+
+//Abstract base class
+class HopeFearDecoder {
+public:
+ //iterator methods
+ virtual void reset() = 0;
+ virtual void next() = 0;
+ virtual bool finished() = 0;
+
+ virtual ~HopeFearDecoder() {};
+
+ /**
+ * Calculate hope, fear and model hypotheses
+ **/
+ virtual void HopeFear(
+ const std::vector<ValType>& backgroundBleu,
+ const MiraWeightVector& wv,
+ HopeFearData* hopeFear
+ ) = 0;
+
+ /** Max score decoding */
+ virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats)
+ = 0;
+
+ /** Calculate bleu on training set */
+ ValType Evaluate(const AvgWeightVector& wv);
+
+protected:
+ Scorer* scorer_;
+};
+
+
+/** Gets hope-fear from nbest lists */
+class NbestHopeFearDecoder : public virtual HopeFearDecoder {
+public:
+ NbestHopeFearDecoder(const std::vector<std::string>& featureFiles,
+ const std::vector<std::string>& scoreFiles,
+ bool streaming,
+ bool no_shuffle,
+ bool safe_hope,
+ Scorer* scorer
+ );
+
+ virtual void reset();
+ virtual void next();
+ virtual bool finished();
+
+ virtual void HopeFear(
+ const std::vector<ValType>& backgroundBleu,
+ const MiraWeightVector& wv,
+ HopeFearData* hopeFear
+ );
+
+ virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats);
+
+private:
+ boost::scoped_ptr<HypPackEnumerator> train_;
+ bool safe_hope_;
+
+};
+
+
+
+/** Gets hope-fear from hypergraphs */
+class HypergraphHopeFearDecoder : public virtual HopeFearDecoder {
+public:
+ HypergraphHopeFearDecoder(
+ const std::string& hypergraphDir,
+ const std::vector<std::string>& referenceFiles,
+ size_t num_dense,
+ bool streaming,
+ bool no_shuffle,
+ bool safe_hope,
+ size_t hg_pruning,
+ const MiraWeightVector& wv,
+ Scorer* scorer_
+ );
+
+ virtual void reset();
+ virtual void next();
+ virtual bool finished();
+
+ virtual void HopeFear(
+ const std::vector<ValType>& backgroundBleu,
+ const MiraWeightVector& wv,
+ HopeFearData* hopeFear
+ );
+
+ virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats);
+
+private:
+ size_t num_dense_;
+ //maps sentence Id to graph ptr
+ typedef std::map<size_t, boost::shared_ptr<Graph> > GraphColl;
+ GraphColl graphs_;
+ std::vector<size_t> sentenceIds_;
+ std::vector<size_t>::const_iterator sentenceIdIter_;
+ ReferenceSet references_;
+ Vocab vocab_;
+};
+
+};
+
+#endif
+
diff --git a/mert/HwcmScorer.cpp b/mert/HwcmScorer.cpp
new file mode 100644
index 000000000..6aff77def
--- /dev/null
+++ b/mert/HwcmScorer.cpp
@@ -0,0 +1,165 @@
+#include "HwcmScorer.h"
+
+#include <fstream>
+
+#include "ScoreStats.h"
+#include "Util.h"
+
+#include "util/tokenize_piece.hh"
+
+// HWCM score (Liu and Gildea, 2005). Implements F1 instead of precision for better modelling of hypothesis length.
+// assumes dependency trees on target side (generated by scripts/training/wrappers/conll2mosesxml.py ; use with option --brackets for reference).
+// reads reference trees from separate file {REFERENCE_FILE}.trees to support mix of string-based and tree-based metrics.
+
+using namespace std;
+
+namespace MosesTuning
+{
+
+
+HwcmScorer::HwcmScorer(const string& config)
+ : StatisticsBasedScorer("HWCM",config) {}
+
+HwcmScorer::~HwcmScorer() {}
+
+void HwcmScorer::setReferenceFiles(const vector<string>& referenceFiles)
+{
+ // For each line in the reference file, create a tree object
+ if (referenceFiles.size() != 1) {
+ throw runtime_error("HWCM only supports a single reference");
+ }
+ m_ref_trees.clear();
+ m_ref_hwc.clear();
+ ifstream in((referenceFiles[0] + ".trees").c_str());
+ if (!in) {
+ throw runtime_error("Unable to open " + referenceFiles[0] + ".trees");
+ }
+ string line;
+ while (getline(in,line)) {
+ line = this->preprocessSentence(line);
+ TreePointer tree (boost::make_shared<InternalTree>(line));
+ m_ref_trees.push_back(tree);
+ vector<map<string, int> > hwc (kHwcmOrder);
+ vector<string> history(kHwcmOrder);
+ extractHeadWordChain(tree, history, hwc);
+ m_ref_hwc.push_back(hwc);
+ vector<int> totals(kHwcmOrder);
+ for (size_t i = 0; i < kHwcmOrder; i++) {
+ for (map<string, int>::const_iterator it = m_ref_hwc.back()[i].begin(); it != m_ref_hwc.back()[i].end(); it++) {
+ totals[i] += it->second;
+ }
+ }
+ m_ref_lengths.push_back(totals);
+ }
+ TRACE_ERR(endl);
+
+}
+
+void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history, vector<map<string, int> > & hwc) {
+
+ if (tree->GetLength() > 0) {
+ string head = getHead(tree);
+
+ if (head.empty()) {
+ for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+ extractHeadWordChain(*it, history, hwc);
+ }
+ }
+ else {
+ vector<string> new_history(kHwcmOrder);
+ new_history[0] = head;
+ hwc[0][head]++;
+ for (size_t hist_idx = 0; hist_idx < kHwcmOrder-1; hist_idx++) {
+ if (!history[hist_idx].empty()) {
+ string chain = history[hist_idx] + " " + head;
+ hwc[hist_idx+1][chain]++;
+ if (hist_idx+2 < kHwcmOrder) {
+ new_history[hist_idx+1] = chain;
+ }
+ }
+ }
+ for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+ extractHeadWordChain(*it, new_history, hwc);
+ }
+ }
+ }
+}
+
+string HwcmScorer::getHead(TreePointer tree) {
+ // assumption (only true for dependency parse: each constituent has a preterminal label, and corresponding terminal is head)
+ // if constituent has multiple preterminals, first one is picked; if it has no preterminals, empty string is returned
+ for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it)
+ {
+ TreePointer child = *it;
+
+ if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
+ return child->GetChildren()[0]->GetLabel();
+ }
+ }
+ return "";
+
+}
+
+void HwcmScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
+{
+ if (sid >= m_ref_trees.size()) {
+ stringstream msg;
+ msg << "Sentence id (" << sid << ") not found in reference set";
+ throw runtime_error(msg.str());
+ }
+
+ string sentence = this->preprocessSentence(text);
+
+ // if sentence has '|||', assume that tree is in second position (n-best-list);
+ // otherwise, assume it is in first position (calling 'evaluate' with tree as reference)
+ util::TokenIter<util::MultiCharacter> it(sentence, util::MultiCharacter("|||"));
+ ++it;
+ if (it) {
+ sentence = it->as_string();
+ }
+
+ TreePointer tree (boost::make_shared<InternalTree>(sentence));
+ vector<map<string, int> > hwc_test (kHwcmOrder);
+ vector<string> history(kHwcmOrder);
+ extractHeadWordChain(tree, history, hwc_test);
+
+ ostringstream stats;
+ for (size_t i = 0; i < kHwcmOrder; i++) {
+ int correct = 0;
+ int test_total = 0;
+ for (map<string, int>::const_iterator it = hwc_test[i].begin(); it != hwc_test[i].end(); it++) {
+ test_total += it->second;
+ map<string, int>::const_iterator it2 = m_ref_hwc[sid][i].find(it->first);
+ if (it2 != m_ref_hwc[sid][i].end()) {
+ correct += std::min(it->second, it2->second);
+ }
+ }
+ stats << correct << " " << test_total << " " << m_ref_lengths[sid][i] << " " ;
+ }
+
+ string stats_str = stats.str();
+ entry.set(stats_str);
+}
+
+float HwcmScorer::calculateScore(const vector<ScoreStatsType>& comps) const
+{
+ float precision = 0;
+ float recall = 0;
+ for (size_t i = 0; i < kHwcmOrder; i++) {
+ float matches = comps[i*3];
+ float test_total = comps[1+(i*3)];
+ float ref_total = comps[2+(i*3)];
+ if (test_total > 0) {
+ precision += matches/test_total;
+ }
+ if (ref_total > 0) {
+ recall += matches/ref_total;
+ }
+ }
+
+ precision /= (float)kHwcmOrder;
+ recall /= (float)kHwcmOrder;
+ return (2*precision*recall)/(precision+recall); // f1-score
+}
+
+} \ No newline at end of file
diff --git a/mert/HwcmScorer.h b/mert/HwcmScorer.h
new file mode 100644
index 000000000..16d563424
--- /dev/null
+++ b/mert/HwcmScorer.h
@@ -0,0 +1,64 @@
+#ifndef MERT_HWCM_SCORER_H_
+#define MERT_HWCM_SCORER_H_
+
+#include <string>
+#include <vector>
+
+#include "StatisticsBasedScorer.h"
+#include "moses/FF/InternalTree.h"
+
+using Moses::TreePointer;
+using Moses::InternalTree;
+
+namespace MosesTuning
+{
+
+
+class ScoreStats;
+const size_t kHwcmOrder = 4;
+
+/**
+ * HWCM scoring (Liu and Gildea 2005), but F1 instead of precision.
+ */
+class HwcmScorer: public StatisticsBasedScorer
+{
+public:
+ explicit HwcmScorer(const std::string& config = "");
+ ~HwcmScorer();
+
+ virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
+ virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
+
+ virtual std::size_t NumberOfScores() const {
+ return kHwcmOrder*3;
+ }
+
+ virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const;
+
+ virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const {
+ return totals[2];
+ }
+
+ //TODO: actually, we use trees which we store in place of alignment. Maybe use something analogous to Phrase Properties to cleanly store trees?
+ bool useAlignment() const {
+ return true;
+ }
+
+private:
+
+ // data extracted from reference files
+ std::vector<TreePointer> m_ref_trees;
+ std::vector<std::vector<std::map<std::string, int> > > m_ref_hwc;
+ std::vector<std::vector<int> > m_ref_lengths;
+
+ void extractHeadWordChain(TreePointer tree, std::vector<std::string> & history, std::vector<std::map<std::string, int> > & hwc);
+ std::string getHead(TreePointer tree);
+
+ // no copying allowed
+ HwcmScorer(const HwcmScorer&);
+ HwcmScorer& operator=(const HwcmScorer&);
+};
+
+}
+
+#endif // MERT_HWCM_SCORER_H_ \ No newline at end of file
diff --git a/mert/Hypergraph.cpp b/mert/Hypergraph.cpp
new file mode 100644
index 000000000..b7725ead0
--- /dev/null
+++ b/mert/Hypergraph.cpp
@@ -0,0 +1,313 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+#include <iostream>
+#include <set>
+
+#include <boost/lexical_cast.hpp>
+
+#include "util/double-conversion/double-conversion.h"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+#include "Hypergraph.h"
+
+using namespace std;
+static const string kBOS = "<s>";
+static const string kEOS = "</s>";
+
+namespace MosesTuning {
+
+StringPiece NextLine(util::FilePiece& from) {
+ StringPiece line;
+ while ((line = from.ReadLine()).starts_with("#"));
+ return line;
+}
+
+Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)){
+}
+
+const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) {
+#if BOOST_VERSION >= 104200
+ Map::const_iterator i= map_.find(str, Hash(), Equals());
+#else
+ std::string copied_str(str.data(), str.size());
+ Map::const_iterator i = map_.find(copied_str.c_str());
+#endif
+ if (i != map_.end()) return *i;
+ char *copied = static_cast<char*>(piece_backing_.Allocate(str.size() + 1));
+ memcpy(copied, str.data(), str.size());
+ copied[str.size()] = 0;
+ return *map_.insert(Entry(copied, map_.size())).first;
+}
+
+double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+
+
+/**
+ * Reads an incoming edge. Returns edge and source words covered.
+**/
+static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) {
+ Edge* edge = graph.NewEdge();
+ StringPiece line = from.ReadLine(); //Don't allow comments within edge lists
+ util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
+ //Target
+ for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
+ StringPiece got = *i;
+ if ('[' == *got.data() && ']' == got.data()[got.size() - 1]) {
+ // non-terminal
+ char *end_ptr;
+ unsigned long int child = std::strtoul(got.data() + 1, &end_ptr, 10);
+ UTIL_THROW_IF(end_ptr != got.data() + got.size() - 1, HypergraphException, "Bad non-terminal" << got);
+ UTIL_THROW_IF(child >= graph.VertexSize(), HypergraphException, "Reference to vertex " << child << " but we only have " << graph.VertexSize() << " vertices. Is the file in bottom-up format?");
+ edge->AddWord(NULL);
+ edge->AddChild(child);
+ } else {
+ const Vocab::Entry &found = graph.MutableVocab().FindOrAdd(got);
+ edge->AddWord(&found);
+ }
+ }
+
+ //Features
+ ++pipes;
+ for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
+ StringPiece fv = *i;
+ if (!fv.size()) break;
+ size_t equals = fv.find_last_of("=");
+ UTIL_THROW_IF(equals == fv.npos, HypergraphException, "Failed to parse feature '" << fv << "'");
+ StringPiece name = fv.substr(0,equals);
+ StringPiece value = fv.substr(equals+1);
+ int processed;
+ float score = converter.StringToFloat(value.data(), value.length(), &processed);
+ UTIL_THROW_IF(isnan(score), HypergraphException, "Failed to parse weight '" << value << "'");
+ edge->AddFeature(name,score);
+ }
+ //Covered words
+ ++pipes;
+ size_t sourceCovered = boost::lexical_cast<size_t>(*pipes);
+ return pair<Edge*,size_t>(edge,sourceCovered);
+}
+
+void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const {
+
+ Graph& newGraph = *pNewGraph;
+ //TODO: Optimise case where no pruning required
+
+ //For debug
+
+
+ /*
+ map<const Edge*, string> edgeIds;
+ for (size_t i = 0; i < edges_.Size(); ++i) {
+ stringstream str;
+ size_t childId = 0;
+ for (size_t j = 0; j < edges_[i].Words().size(); ++j) {
+ if (edges_[i].Words()[j]) {
+ str << edges_[i].Words()[j]->first << " ";
+ } else {
+ str << "[" << edges_[i].Children()[childId++] << "] ";
+ }
+ }
+ edgeIds[&(edges_[i])] = str.str();
+ }
+ */
+
+ //end For debug
+
+ map<const Edge*, FeatureStatsType> edgeBackwardScores;
+ map<const Edge*, size_t> edgeHeads;
+ vector<FeatureStatsType> vertexBackwardScores(vertices_.Size(), kMinScore);
+ vector<vector<const Edge*> > outgoing(vertices_.Size());
+
+ //Compute backward scores
+ for (size_t vi = 0; vi < vertices_.Size(); ++vi) {
+ // cerr << "Vertex " << vi << endl;
+ const Vertex& vertex = vertices_[vi];
+ const vector<const Edge*>& incoming = vertex.GetIncoming();
+ if (!incoming.size()) {
+ vertexBackwardScores[vi] = 0;
+ } else {
+ for (size_t ei = 0; ei < incoming.size(); ++ei) {
+ //cerr << "Edge " << edgeIds[incoming[ei]] << endl;
+ edgeHeads[incoming[ei]]= vi;
+ FeatureStatsType incomingScore = incoming[ei]->GetScore(weights);
+ for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) {
+ //cerr << "\tChild " << incoming[ei]->Children()[i] << endl;
+ size_t childId = incoming[ei]->Children()[i];
+ UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore,
+ HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId);
+ outgoing[childId].push_back(incoming[ei]);
+ incomingScore += vertexBackwardScores[childId];
+ }
+ edgeBackwardScores[incoming[ei]]= incomingScore;
+ //cerr << "Backward score: " << incomingScore << endl;
+ if (incomingScore > vertexBackwardScores[vi]) vertexBackwardScores[vi] = incomingScore;
+ }
+ }
+ }
+
+ //Compute forward scores
+ vector<FeatureStatsType> vertexForwardScores(vertices_.Size(), kMinScore);
+ map<const Edge*, FeatureStatsType> edgeForwardScores;
+ for (size_t i = 1; i <= vertices_.Size(); ++i) {
+ size_t vi = vertices_.Size() - i;
+ //cerr << "Vertex " << vi << endl;
+ if (!outgoing[vi].size()) {
+ vertexForwardScores[vi] = 0;
+ } else {
+ for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) {
+ //cerr << "Edge " << edgeIds[outgoing[vi][ei]] << endl;
+ FeatureStatsType outgoingScore = 0;
+ //add score of head
+ outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]];
+ //cerr << "Forward score " << outgoingScore << endl;
+ edgeForwardScores[outgoing[vi][ei]] = outgoingScore;
+ //sum scores of siblings
+ for (size_t i = 0; i < outgoing[vi][ei]->Children().size(); ++i) {
+ size_t siblingId = outgoing[vi][ei]->Children()[i];
+ if (siblingId != vi) {
+ //cerr << "\tSibling " << siblingId << endl;
+ outgoingScore += vertexBackwardScores[siblingId];
+ }
+ }
+ outgoingScore += outgoing[vi][ei]->GetScore(weights);
+ if (outgoingScore > vertexForwardScores[vi]) vertexForwardScores[vi] = outgoingScore;
+ //cerr << "Vertex " << vi << " forward score " << outgoingScore << endl;
+ }
+ }
+ }
+
+
+
+ multimap<FeatureStatsType, const Edge*> edgeScores;
+ for (size_t i = 0; i < edges_.Size(); ++i) {
+ const Edge* edge = &(edges_[i]);
+ if (edgeForwardScores.find(edge) == edgeForwardScores.end()) {
+ //This edge has no children, so didn't get a forward score. Its forward score
+ //is that of its head
+ edgeForwardScores[edge] = vertexForwardScores[edgeHeads[edge]];
+ }
+ FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge];
+ edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge));
+ // cerr << edgeIds[edge] << " " << score << endl;
+ }
+
+
+
+ multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin();
+ size_t edgeCount = 1;
+ while(edgeCount < minEdgeCount && ei != edgeScores.rend()) {
+ ++ei;
+ ++edgeCount;
+ }
+ multimap<FeatureStatsType, const Edge*>::const_iterator lowest = edgeScores.begin();
+ if (ei != edgeScores.rend()) lowest = edgeScores.lower_bound(ei->first);
+
+ //cerr << "Retained edges" << endl;
+ set<size_t> retainedVertices;
+ set<const Edge*> retainedEdges;
+ for (; lowest != edgeScores.end(); ++lowest) {
+ //cerr << lowest->first << " " << edgeIds[lowest->second] << endl;
+ retainedEdges.insert(lowest->second);
+ retainedVertices.insert(edgeHeads[lowest->second]);
+ for (size_t i = 0; i < lowest->second->Children().size(); ++i) {
+ retainedVertices.insert(lowest->second->Children()[i]);
+ }
+ }
+ newGraph.SetCounts(retainedVertices.size(), retainedEdges.size());
+
+ //cerr << "Retained vertices" << endl;
+ map<size_t,size_t> oldIdToNew;
+ size_t vi = 0;
+ for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) {
+ //cerr << *i << " New: " << vi << endl;
+ oldIdToNew[*i] = vi;
+ Vertex* vertex = newGraph.NewVertex();
+ vertex->SetSourceCovered(vertices_[*i].SourceCovered());
+ }
+
+ for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) {
+ Edge* newEdge = newGraph.NewEdge();
+ const Edge* oldEdge = *i;
+ for (size_t j = 0; j < oldEdge->Words().size(); ++j) {
+ newEdge->AddWord(oldEdge->Words()[j]);
+ }
+ for (size_t j = 0; j < oldEdge->Children().size(); ++j) {
+ newEdge->AddChild(oldIdToNew[oldEdge->Children()[j]]);
+ }
+ newEdge->SetFeatures(oldEdge->Features());
+ Vertex& newHead = newGraph.vertices_[oldIdToNew[edgeHeads[oldEdge]]];
+ newHead.AddEdge(newEdge);
+ }
+
+ /*
+ cerr << "New graph" << endl;
+ for (size_t vi = 0; vi < newGraph.VertexSize(); ++vi) {
+ cerr << "Vertex " << vi << endl;
+ const vector<const Edge*> incoming = newGraph.GetVertex(vi).GetIncoming();
+ for (size_t ei = 0; ei < incoming.size(); ++ei) {
+ size_t childId = 0;
+ for (size_t wi = 0; wi < incoming[ei]->Words().size(); ++wi) {
+ const Vocab::Entry* word = incoming[ei]->Words()[wi];
+ if (word) {
+ cerr << word->first << " ";
+ } else {
+ cerr << "[" << incoming[ei]->Children()[childId++] << "] ";
+ }
+ }
+ cerr << " Score: " << incoming[ei]->GetScore(weights) << endl;
+ }
+ cerr << endl;
+ }
+ */
+
+
+}
+
+/**
+ * Read from "Kenneth's hypergraph" aka cdec target_graph format (with comments)
+**/
+void ReadGraph(util::FilePiece &from, Graph &graph) {
+
+ //First line should contain field names
+ StringPiece line = from.ReadLine();
+ UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'");
+ line = NextLine(from);
+
+ //Then expect numbers of vertices
+ util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' '));
+ unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i);
+ ++i;
+ unsigned long int edges = boost::lexical_cast<unsigned long int>(*i);
+ graph.SetCounts(vertices, edges);
+ //cerr << "vertices: " << vertices << "; edges: " << edges << endl;
+ for (size_t i = 0; i < vertices; ++i) {
+ line = NextLine(from);
+ unsigned long int edge_count = boost::lexical_cast<unsigned long int>(line);
+ Vertex* vertex = graph.NewVertex();
+ for (unsigned long int e = 0; e < edge_count; ++e) {
+ pair<Edge*,size_t> edge = ReadEdge(from, graph);
+ vertex->AddEdge(edge.first);
+ //Note: the file format attaches this to the edge, but it's really a property
+ //of the vertex.
+ if (!e) {vertex->SetSourceCovered(edge.second);}
+ }
+ }
+}
+
+};
diff --git a/mert/Hypergraph.h b/mert/Hypergraph.h
new file mode 100644
index 000000000..b6ee6c3f8
--- /dev/null
+++ b/mert/Hypergraph.h
@@ -0,0 +1,251 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#ifndef MERT_HYPERGRAPH_H
+#define MERT_HYPERGRAPH_H
+
+#include <string>
+
+#include <boost/noncopyable.hpp>
+#include <boost/scoped_array.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/functional/hash/hash.hpp>
+#include <boost/unordered_map.hpp>
+
+
+#include "util/exception.hh"
+#include "util/file_piece.hh"
+#include "util/murmur_hash.hh"
+#include "util/pool.hh"
+#include "util/string_piece.hh"
+
+#include "FeatureStats.h"
+
+namespace MosesTuning {
+
+typedef unsigned int WordIndex;
+const WordIndex kMaxWordIndex = UINT_MAX;
+const FeatureStatsType kMinScore = -std::numeric_limits<FeatureStatsType>::max();
+
+template <class T> class FixedAllocator : boost::noncopyable {
+ public:
+ FixedAllocator() : current_(NULL), end_(NULL) {}
+
+ void Init(std::size_t count) {
+ assert(!current_);
+ array_.reset(new T[count]);
+ current_ = array_.get();
+ end_ = current_ + count;
+ }
+
+ T &operator[](std::size_t idx) {
+ return array_.get()[idx];
+ }
+ const T &operator[](std::size_t idx) const {
+ return array_.get()[idx];
+ }
+
+ T *New() {
+ T *ret = current_++;
+ UTIL_THROW_IF(ret >= end_, util::Exception, "Allocating past end");
+ return ret;
+ }
+
+ std::size_t Capacity() const {
+ return end_ - array_.get();
+ }
+
+ std::size_t Size() const {
+ return current_ - array_.get();
+ }
+
+ private:
+ boost::scoped_array<T> array_;
+ T *current_, *end_;
+};
+
+
+class Vocab {
+ public:
+ Vocab();
+
+ typedef std::pair<const char *const, WordIndex> Entry;
+
+ const Entry &FindOrAdd(const StringPiece &str);
+
+ const Entry& Bos() const {return bos_;}
+
+ const Entry& Eos() const {return eos_;}
+
+ private:
+ util::Pool piece_backing_;
+
+ struct Hash : public std::unary_function<const char *, std::size_t> {
+ std::size_t operator()(StringPiece str) const {
+ return util::MurmurHashNative(str.data(), str.size());
+ }
+ };
+
+ struct Equals : public std::binary_function<const char *, const char *, bool> {
+ bool operator()(StringPiece first, StringPiece second) const {
+ return first == second;
+ }
+ };
+
+ typedef boost::unordered_map<const char *, WordIndex, Hash, Equals> Map;
+ Map map_;
+ Entry eos_;
+ Entry bos_;
+
+};
+
+typedef std::vector<const Vocab::Entry*> WordVec;
+
+class Vertex;
+
+//Use shared pointer to save copying when we prune
+typedef boost::shared_ptr<SparseVector> FeaturePtr;
+
+/**
+ * An edge has 1 head vertex, 0..n child (tail) vertices, a list of words and a feature vector.
+**/
+class Edge {
+ public:
+ Edge() {features_.reset(new SparseVector());}
+
+ void AddWord(const Vocab::Entry *word) {
+ words_.push_back(word);
+ }
+
+ void AddChild(size_t child) {
+ children_.push_back(child);
+ }
+
+ void AddFeature(const StringPiece& name, FeatureStatsType value) {
+ //TODO StringPiece interface
+ features_->set(name.as_string(),value);
+ }
+
+
+ const WordVec &Words() const {
+ return words_;
+ }
+
+ const FeaturePtr& Features() const {
+ return features_;
+ }
+
+ void SetFeatures(const FeaturePtr& features) {
+ features_ = features;
+ }
+
+ const std::vector<size_t>& Children() const {
+ return children_;
+ }
+
+ FeatureStatsType GetScore(const SparseVector& weights) const {
+ return inner_product(*(features_.get()), weights);
+ }
+
+ private:
+ // NULL for non-terminals.
+ std::vector<const Vocab::Entry*> words_;
+ std::vector<size_t> children_;
+ boost::shared_ptr<SparseVector> features_;
+};
+
+/*
+ * A vertex has 0..n incoming edges
+ **/
+class Vertex {
+ public:
+ Vertex() : sourceCovered_(0) {}
+
+ void AddEdge(const Edge* edge) {incoming_.push_back(edge);}
+
+ void SetSourceCovered(size_t sourceCovered) {sourceCovered_ = sourceCovered;}
+
+ const std::vector<const Edge*>& GetIncoming() const {return incoming_;}
+
+ size_t SourceCovered() const {return sourceCovered_;}
+
+ private:
+ std::vector<const Edge*> incoming_;
+ size_t sourceCovered_;
+};
+
+
+class Graph : boost::noncopyable {
+ public:
+ Graph(Vocab& vocab) : vocab_(vocab) {}
+
+ void SetCounts(std::size_t vertices, std::size_t edges) {
+ vertices_.Init(vertices);
+ edges_.Init(edges);
+ }
+
+ Vocab &MutableVocab() { return vocab_; }
+
+ Edge *NewEdge() {
+ return edges_.New();
+ }
+
+ Vertex *NewVertex() {
+ return vertices_.New();
+ }
+
+ const Vertex &GetVertex(std::size_t index) const {
+ return vertices_[index];
+ }
+
+ Edge &GetEdge(std::size_t index) {
+ return edges_[index];
+ }
+
+ /* Created a pruned copy of this graph with minEdgeCount edges. Uses
+ the scores in the max-product semiring to rank edges, as suggested by
+ Colin Cherry */
+ void Prune(Graph* newGraph, const SparseVector& weights, size_t minEdgeCount) const;
+
+ std::size_t VertexSize() const { return vertices_.Size(); }
+ std::size_t EdgeSize() const { return edges_.Size(); }
+
+ bool IsBoundary(const Vocab::Entry* word) const {
+ return word->second == vocab_.Bos().second || word->second == vocab_.Eos().second;
+ }
+
+ private:
+ FixedAllocator<Edge> edges_;
+ FixedAllocator<Vertex> vertices_;
+ Vocab& vocab_;
+};
+
+class HypergraphException : public util::Exception {
+ public:
+ HypergraphException() {}
+ ~HypergraphException() throw() {}
+};
+
+
+void ReadGraph(util::FilePiece &from, Graph &graph);
+
+
+};
+
+#endif
diff --git a/mert/HypergraphTest.cpp b/mert/HypergraphTest.cpp
new file mode 100644
index 000000000..345a445f0
--- /dev/null
+++ b/mert/HypergraphTest.cpp
@@ -0,0 +1,151 @@
+#include <iostream>
+
+#define BOOST_TEST_MODULE MertForestRescore
+#include <boost/test/unit_test.hpp>
+
+#include "Hypergraph.h"
+
+using namespace std;
+using namespace MosesTuning;
+
+BOOST_AUTO_TEST_CASE(prune)
+{
+ Vocab vocab;
+ WordVec words;
+ string wordStrings[] =
+ {"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"};
+ for (size_t i = 0; i < 13; ++i) {
+ words.push_back(&(vocab.FindOrAdd((wordStrings[i]))));
+ }
+
+ const string f1 = "foo";
+ const string f2 = "bar";
+ Graph graph(vocab);
+ graph.SetCounts(5,8);
+
+ Edge* e0 = graph.NewEdge();
+ e0->AddWord(words[0]);
+
+ Vertex* v0 = graph.NewVertex();
+ v0->AddEdge(e0);
+
+ Edge* e1 = graph.NewEdge();
+ e1->AddWord(NULL);
+ e1->AddChild(0);
+ e1->AddWord(words[2]);
+ e1->AddWord(words[3]);
+ e1->AddFeature(f1,1);
+ e1->AddFeature(f2,1);
+ Edge* e5 = graph.NewEdge();
+ e5->AddWord(NULL);
+ e5->AddChild(0);
+ e5->AddWord(words[9]);
+ e5->AddWord(words[10]);
+ e5->AddFeature(f1,2);
+ e5->AddFeature(f2,-2);
+
+ Vertex* v1 = graph.NewVertex();
+ v1->AddEdge(e1);
+ v1->AddEdge(e5);
+ v1->SetSourceCovered(1);
+
+ Edge* e2 = graph.NewEdge();
+ e2->AddWord(NULL);
+ e2->AddChild(1);
+ e2->AddWord(words[4]);
+ e2->AddWord(words[5]);
+ e2->AddFeature(f2,3);
+
+ Vertex* v2 = graph.NewVertex();
+ v2->AddEdge(e2);
+ v2->SetSourceCovered(3);
+
+ Edge* e3 = graph.NewEdge();
+ e3->AddWord(NULL);
+ e3->AddChild(2);
+ e3->AddWord(words[6]);
+ e3->AddWord(words[7]);
+ e3->AddWord(words[8]);
+ e3->AddFeature(f1,1);
+ Edge* e6 = graph.NewEdge();
+ e6->AddWord(NULL);
+ e6->AddChild(2);
+ e6->AddWord(words[9]);
+ e6->AddWord(words[12]);
+ e6->AddFeature(f2,1);
+ Edge* e7 = graph.NewEdge();
+ e7->AddWord(NULL);
+ e7->AddChild(1);
+ e7->AddWord(words[11]);
+ e7->AddWord(words[12]);
+ e7->AddFeature(f1,2);
+ e7->AddFeature(f2,3);
+
+ Vertex* v3 = graph.NewVertex();
+ v3->AddEdge(e3);
+ v3->AddEdge(e6);
+ v3->AddEdge(e7);
+ v3->SetSourceCovered(5);
+
+ Edge* e4 = graph.NewEdge();
+ e4->AddWord(NULL);
+ e4->AddChild(3);
+ e4->AddWord(words[1]);
+
+ Vertex* v4 = graph.NewVertex();
+ v4->AddEdge(e4);
+ v4->SetSourceCovered(6);
+
+ SparseVector weights;
+ weights.set(f1,2);
+ weights.set(f2,1);
+
+ Graph pruned(vocab);
+ graph.Prune(&pruned, weights, 5);
+
+ BOOST_CHECK_EQUAL(5, pruned.EdgeSize());
+ BOOST_CHECK_EQUAL(4, pruned.VertexSize());
+
+ //edges retained should be best path (<s> ab jk </s>) and hi
+ BOOST_CHECK_EQUAL(1, pruned.GetVertex(0).GetIncoming().size());
+ BOOST_CHECK_EQUAL(2, pruned.GetVertex(1).GetIncoming().size());
+ BOOST_CHECK_EQUAL(1, pruned.GetVertex(2).GetIncoming().size());
+ BOOST_CHECK_EQUAL(1, pruned.GetVertex(3).GetIncoming().size());
+
+ const Edge* edge;
+
+ edge = pruned.GetVertex(0).GetIncoming()[0];
+ BOOST_CHECK_EQUAL(1, edge->Words().size());
+ BOOST_CHECK_EQUAL(words[0], edge->Words()[0]);
+
+ edge = pruned.GetVertex(1).GetIncoming()[0];
+ BOOST_CHECK_EQUAL(3, edge->Words().size());
+ BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]);
+ BOOST_CHECK_EQUAL(words[2]->first, edge->Words()[1]->first);
+ BOOST_CHECK_EQUAL(words[3]->first, edge->Words()[2]->first);
+
+ edge = pruned.GetVertex(1).GetIncoming()[1];
+ BOOST_CHECK_EQUAL(3, edge->Words().size());
+ BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]);
+ BOOST_CHECK_EQUAL(words[9]->first, edge->Words()[1]->first);
+ BOOST_CHECK_EQUAL(words[10]->first, edge->Words()[2]->first);
+
+ edge = pruned.GetVertex(2).GetIncoming()[0];
+ BOOST_CHECK_EQUAL(3, edge->Words().size());
+ BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]);
+ BOOST_CHECK_EQUAL(words[11]->first, edge->Words()[1]->first);
+ BOOST_CHECK_EQUAL(words[12]->first, edge->Words()[2]->first);
+
+ edge = pruned.GetVertex(3).GetIncoming()[0];
+ BOOST_CHECK_EQUAL(2, edge->Words().size());
+ BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]);
+ BOOST_CHECK_EQUAL(words[1]->first, edge->Words()[1]->first);
+
+
+
+
+
+// BOOST_CHECK_EQUAL(words[0], pruned.GetVertex(0).GetIncoming()[0].Words()[0]);
+
+
+}
diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp
index 87cec9211..ea4240472 100644
--- a/mert/InterpolatedScorer.cpp
+++ b/mert/InterpolatedScorer.cpp
@@ -153,6 +153,41 @@ void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& di
}
+/** Interpolated scorer gets a vector of sufficient statistics, calls all scorers with corresponding statistics,
+ and combines them with weights **/
+float InterpolatedScorer::calculateScore(const std::vector<ScoreStatsType>& totals) const
+{
+ size_t scorerNum = 0;
+ size_t last = 0;
+ float score = 0;
+ for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
+ itsc != m_scorers.end(); ++itsc) {
+ int numScoresScorer = (*itsc)->NumberOfScores();
+ std::vector<ScoreStatsType> totals_scorer(totals.begin()+last, totals.begin()+last+numScoresScorer);
+ score += (*itsc)->calculateScore(totals_scorer) * m_scorer_weights[scorerNum];
+ last += numScoresScorer;
+ scorerNum++;
+ }
+ return score;
+}
+
+
+float InterpolatedScorer::getReferenceLength(const std::vector<ScoreStatsType>& totals) const
+{
+ size_t scorerNum = 0;
+ size_t last = 0;
+ float refLen = 0;
+ for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
+ itsc != m_scorers.end(); ++itsc) {
+ int numScoresScorer = (*itsc)->NumberOfScores();
+ std::vector<ScoreStatsType> totals_scorer(totals.begin()+last, totals.begin()+last+numScoresScorer);
+ refLen += (*itsc)->getReferenceLength(totals_scorer) * m_scorer_weights[scorerNum];
+ last += numScoresScorer;
+ scorerNum++;
+ }
+ return refLen;
+ }
+
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin();
diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h
index d1078e9e1..159abf6d4 100644
--- a/mert/InterpolatedScorer.h
+++ b/mert/InterpolatedScorer.h
@@ -39,6 +39,10 @@ public:
virtual void setScoreData(ScoreData* data);
+ virtual float calculateScore(const std::vector<ScoreStatsType>& totals) const;
+
+ virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const;
+
/**
* Set the factors, which should be used for this metric
*/
diff --git a/mert/Jamfile b/mert/Jamfile
index 34c640b06..ee8a1fcc3 100644
--- a/mert/Jamfile
+++ b/mert/Jamfile
@@ -15,6 +15,9 @@ FeatureStats.cpp
FeatureArray.cpp
FeatureData.cpp
FeatureDataIterator.cpp
+ForestRescore.cpp
+HopeFearDecoder.cpp
+Hypergraph.cpp
MiraFeatureVector.cpp
MiraWeightVector.cpp
HypPackEnumerator.cpp
@@ -26,6 +29,8 @@ SemposOverlapping.cpp
InterpolatedScorer.cpp
Point.cpp
PerScorer.cpp
+HwcmScorer.cpp
+../moses/FF/InternalTree.cpp
Scorer.cpp
ScorerFactory.cpp
Optimizer.cpp
@@ -62,13 +67,15 @@ exe sentence-bleu : sentence-bleu.cpp mert_lib ;
exe pro : pro.cpp mert_lib ..//boost_program_options ;
-exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ;
+exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ..//boost_filesystem ;
alias programs : mert extractor evaluator pro kbmira sentence-bleu ;
unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test forest_rescore_test : ForestRescoreTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test hypergraph_test : HypergraphTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test point_test : PointTest.cpp mert_lib ..//boost_unit_test_framework ;
diff --git a/mert/MeteorScorer.cpp b/mert/MeteorScorer.cpp
index 904e68efd..3a7eb6ab7 100644
--- a/mert/MeteorScorer.cpp
+++ b/mert/MeteorScorer.cpp
@@ -146,7 +146,7 @@ void MeteorScorer::prepareStats(size_t sid, const string& text, ScoreStats& entr
entry.set(stats_str);
}
-float MeteorScorer::calculateScore(const vector<int>& comps) const
+float MeteorScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
string score;
stringstream input;
@@ -184,7 +184,7 @@ void MeteorScorer::setReferenceFiles(const vector<string>& referenceFiles) {}
void MeteorScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) {}
-float MeteorScorer::calculateScore(const vector<int>& comps) const
+float MeteorScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
// Should never be reached
return 0.0;
diff --git a/mert/MeteorScorer.h b/mert/MeteorScorer.h
index 8260a9455..31b05ec72 100644
--- a/mert/MeteorScorer.h
+++ b/mert/MeteorScorer.h
@@ -54,7 +54,7 @@ public:
return 23;
}
- virtual float calculateScore(const std::vector<int>& comps) const;
+ virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const;
private:
// Meteor and process IO
diff --git a/mert/MiraFeatureVector.cpp b/mert/MiraFeatureVector.cpp
index dea9b9b83..347ad488e 100644
--- a/mert/MiraFeatureVector.cpp
+++ b/mert/MiraFeatureVector.cpp
@@ -9,18 +9,17 @@ namespace MosesTuning
{
-MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
- : m_dense(vec.dense)
-{
- vector<size_t> sparseFeats = vec.sparse.feats();
+void MiraFeatureVector::InitSparse(const SparseVector& sparse, size_t ignoreLimit) {
+ vector<size_t> sparseFeats = sparse.feats();
bool bFirst = true;
size_t lastFeat = 0;
m_sparseFeats.reserve(sparseFeats.size());
m_sparseVals.reserve(sparseFeats.size());
for(size_t i=0; i<sparseFeats.size(); i++) {
+ if (sparseFeats[i] < ignoreLimit) continue;
size_t feat = m_dense.size() + sparseFeats[i];
m_sparseFeats.push_back(feat);
- m_sparseVals.push_back(vec.sparse.get(sparseFeats[i]));
+ m_sparseVals.push_back(sparse.get(sparseFeats[i]));
// Check ordered property
if(bFirst) {
@@ -35,6 +34,21 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
}
}
+MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
+ : m_dense(vec.dense)
+{
+ InitSparse(vec.sparse);
+}
+
+MiraFeatureVector::MiraFeatureVector(const SparseVector& sparse, size_t num_dense) {
+ m_dense.resize(num_dense);
+ //Assume that features with id [0,num_dense) are the dense features
+ for (size_t id = 0; id < num_dense; ++id) {
+ m_dense[id] = sparse.get(id);
+ }
+ InitSparse(sparse,num_dense);
+}
+
MiraFeatureVector::MiraFeatureVector(const MiraFeatureVector& other)
: m_dense(other.m_dense),
m_sparseFeats(other.m_sparseFeats),
@@ -148,6 +162,22 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
return MiraFeatureVector(dense,sparseFeats,sparseVals);
}
+bool operator==(const MiraFeatureVector& a,const MiraFeatureVector& b) {
+ ValType eps = 1e-8;
+ //dense features
+ if (a.m_dense.size() != b.m_dense.size()) return false;
+ for (size_t i = 0; i < a.m_dense.size(); ++i) {
+ if (fabs(a.m_dense[i]-b.m_dense[i]) < eps) return false;
+ }
+ if (a.m_sparseFeats.size() != b.m_sparseFeats.size()) return false;
+ for (size_t i = 0; i < a.m_sparseFeats.size(); ++i) {
+ if (a.m_sparseFeats[i] != b.m_sparseFeats[i]) return false;
+ if (fabs(a.m_sparseVals[i] != b.m_sparseVals[i])) return false;
+ }
+ return true;
+
+}
+
ostream& operator<<(ostream& o, const MiraFeatureVector& e)
{
for(size_t i=0; i<e.size(); i++) {
diff --git a/mert/MiraFeatureVector.h b/mert/MiraFeatureVector.h
index cb2b1c87d..48aa496b5 100644
--- a/mert/MiraFeatureVector.h
+++ b/mert/MiraFeatureVector.h
@@ -26,7 +26,10 @@ typedef FeatureStatsType ValType;
class MiraFeatureVector
{
public:
+ MiraFeatureVector() {}
MiraFeatureVector(const FeatureDataItem& vec);
+ //Assumes that features in sparse with id < num_dense are dense features
+ MiraFeatureVector(const SparseVector& sparse, size_t num_dense);
MiraFeatureVector(const MiraFeatureVector& other);
MiraFeatureVector(const std::vector<ValType>& dense,
const std::vector<std::size_t>& sparseFeats,
@@ -42,7 +45,12 @@ public:
friend std::ostream& operator<<(std::ostream& o, const MiraFeatureVector& e);
+ friend bool operator==(const MiraFeatureVector& a,const MiraFeatureVector& b);
+
private:
+ //Ignore any sparse features with id < ignoreLimit
+ void InitSparse(const SparseVector& sparse, size_t ignoreLimit = 0);
+
std::vector<ValType> m_dense;
std::vector<std::size_t> m_sparseFeats;
std::vector<ValType> m_sparseVals;
diff --git a/mert/MiraWeightVector.cpp b/mert/MiraWeightVector.cpp
index e23804cbf..c8a1ca774 100644
--- a/mert/MiraWeightVector.cpp
+++ b/mert/MiraWeightVector.cpp
@@ -93,6 +93,14 @@ void MiraWeightVector::update(size_t index, ValType delta)
m_lastUpdated[index] = m_numUpdates;
}
+void MiraWeightVector::ToSparse(SparseVector* sparse) const {
+ for (size_t i = 0; i < m_weights.size(); ++i) {
+ if(abs(m_weights[i])>1e-8) {
+ sparse->set(i,m_weights[i]);
+ }
+ }
+}
+
/**
* Make sure everyone's total is up-to-date
*/
@@ -131,7 +139,7 @@ ostream& operator<<(ostream& o, const MiraWeightVector& e)
for(size_t i=0; i<e.m_weights.size(); i++) {
if(abs(e.m_weights[i])>1e-8) {
if(i>0) o << " ";
- cerr << i << ":" << e.m_weights[i];
+ o << i << ":" << e.m_weights[i];
}
}
return o;
@@ -163,6 +171,15 @@ size_t AvgWeightVector::size() const
return m_wv.m_weights.size();
}
+void AvgWeightVector::ToSparse(SparseVector* sparse) const {
+ for (size_t i = 0; i < size(); ++i) {
+ ValType w = weight(i);
+ if(abs(w)>1e-8) {
+ sparse->set(i,w);
+ }
+ }
+}
+
// --Emacs trickery--
// Local Variables:
// mode:c++
diff --git a/mert/MiraWeightVector.h b/mert/MiraWeightVector.h
index eb27e8a6d..bbc28704b 100644
--- a/mert/MiraWeightVector.h
+++ b/mert/MiraWeightVector.h
@@ -63,6 +63,11 @@ public:
*/
AvgWeightVector avg();
+ /**
+ * Convert to sparse vector, interpreting all features as sparse.
+ **/
+ void ToSparse(SparseVector* sparse) const;
+
friend class AvgWeightVector;
friend std::ostream& operator<<(std::ostream& o, const MiraWeightVector& e);
@@ -99,12 +104,12 @@ public:
ValType score(const MiraFeatureVector& fv) const;
ValType weight(std::size_t index) const;
std::size_t size() const;
+ void ToSparse(SparseVector* sparse) const;
private:
const MiraWeightVector& m_wv;
};
-#endif // MERT_WEIGHT_VECTOR_H
// --Emacs trickery--
// Local Variables:
@@ -113,3 +118,4 @@ private:
// End:
}
+#endif // MERT_WEIGHT_VECTOR_H
diff --git a/mert/PerScorer.cpp b/mert/PerScorer.cpp
index 3e157a55e..69bf257fe 100644
--- a/mert/PerScorer.cpp
+++ b/mert/PerScorer.cpp
@@ -79,10 +79,10 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
entry.set(stats_str);
}
-float PerScorer::calculateScore(const vector<int>& comps) const
+float PerScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
float denom = comps[2];
- float num = comps[0] - max(0,comps[1]-comps[2]);
+ float num = comps[0] - max(0.0f,comps[1]-comps[2]);
if (denom == 0) {
// This shouldn't happen!
return 0.0;
diff --git a/mert/PerScorer.h b/mert/PerScorer.h
index ffb869942..0cc7f8a9a 100644
--- a/mert/PerScorer.h
+++ b/mert/PerScorer.h
@@ -30,7 +30,7 @@ public:
virtual std::size_t NumberOfScores() const {
return 3;
}
- virtual float calculateScore(const std::vector<int>& comps) const;
+ virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const;
private:
// no copying allowed
diff --git a/mert/PermutationScorer.cpp b/mert/PermutationScorer.cpp
index aec389c27..b35016c77 100644
--- a/mert/PermutationScorer.cpp
+++ b/mert/PermutationScorer.cpp
@@ -234,7 +234,7 @@ void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats&
}
//Will just be final score
-statscore_t PermutationScorer::calculateScore(const vector<int>& comps) const
+statscore_t PermutationScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
//cerr << "*******PermutationScorer::calculateScore" ;
//cerr << " " << comps[0]/comps[1] << endl;
diff --git a/mert/PermutationScorer.h b/mert/PermutationScorer.h
index c3d0cc960..500ef0d8f 100644
--- a/mert/PermutationScorer.h
+++ b/mert/PermutationScorer.h
@@ -49,7 +49,7 @@ public:
protected:
- statscore_t calculateScore(const std::vector<int>& scores) const;
+ statscore_t calculateScore(const std::vector<ScoreStatsType>& scores) const;
PermutationScorer(const PermutationScorer&);
~PermutationScorer() {};
PermutationScorer& operator=(const PermutationScorer&);
diff --git a/mert/ScoreData.h b/mert/ScoreData.h
index 9159e029f..5e96a2e06 100644
--- a/mert/ScoreData.h
+++ b/mert/ScoreData.h
@@ -13,6 +13,7 @@
#include <vector>
#include <stdexcept>
#include <string>
+#include <boost/lexical_cast.hpp>
#include "ScoreArray.h"
#include "ScoreStats.h"
@@ -108,7 +109,7 @@ public:
inline int getName(std::size_t idx) const {
idx2name::const_iterator i = m_index_to_array_name.find(idx);
if (i != m_index_to_array_name.end())
- throw std::runtime_error("there is no entry at index " + idx);
+ throw std::runtime_error("there is no entry at index " + boost::lexical_cast<std::string>(idx));
return i->second;
}
};
diff --git a/mert/Scorer.h b/mert/Scorer.h
index 0424398eb..8c468aff2 100644
--- a/mert/Scorer.h
+++ b/mert/Scorer.h
@@ -43,6 +43,19 @@ public:
virtual std::size_t NumberOfScores() const = 0;
/**
+ * Calculate score based on a vector of sufficient statistics.
+ */
+ virtual float calculateScore(const std::vector<ScoreStatsType>& totals) const = 0;
+
+ float calculateSentenceLevelBackgroundScore(const std::vector<ScoreStatsType>& totals, const std::vector<ScoreStatsType>& bg) {
+ std::vector<ScoreStatsType> stats(totals.size());
+ for(size_t i=0; i<stats.size(); i++)
+ stats[i] = totals[i]+bg[i];
+ // Get score and scale by reference length (as per Chiang et al 08)
+ return calculateScore(stats) * getReferenceLength(stats);
+ }
+
+ /**
* Set the reference files. This must be called before prepareStats().
*/
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles) {
@@ -98,6 +111,11 @@ public:
}
/**
+ * Based on vector of sufficient statistics, return length of reference.
+ */
+ virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const = 0;
+
+ /**
* Set the score data, prior to scoring.
*/
virtual void setScoreData(ScoreData* data) {
diff --git a/mert/ScorerFactory.cpp b/mert/ScorerFactory.cpp
index 9b7cc4af0..02573091c 100644
--- a/mert/ScorerFactory.cpp
+++ b/mert/ScorerFactory.cpp
@@ -11,6 +11,7 @@
#include "SemposScorer.h"
#include "PermutationScorer.h"
#include "MeteorScorer.h"
+#include "HwcmScorer.h"
#include "Reference.h"
using namespace std;
@@ -32,6 +33,7 @@ vector<string> ScorerFactory::getTypes()
types.push_back(string("SEMPOS"));
types.push_back(string("LRSCORE"));
types.push_back(string("METEOR"));
+ types.push_back(string("HWCM"));
return types;
}
@@ -56,6 +58,8 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config)
return (PermutationScorer*) new PermutationScorer(type, config);
} else if (type == "METEOR") {
return new MeteorScorer(config);
+ } else if (type == "HWCM") {
+ return new HwcmScorer(config);
} else {
if (type.find(',') != string::npos) {
return new InterpolatedScorer(type, config);
diff --git a/mert/SemposOverlapping.cpp b/mert/SemposOverlapping.cpp
index 718bc7f26..39a4136e8 100644
--- a/mert/SemposOverlapping.cpp
+++ b/mert/SemposOverlapping.cpp
@@ -33,9 +33,9 @@ void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr)
g_overlapping = ovr;
}
-vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
+vector<ScoreStatsType> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
{
- vector<int> stats(2);
+ vector<ScoreStatsType> stats(2);
sentence_t intersection;
set_intersection(cand.begin(), cand.end(), ref.begin(), ref.end(),
@@ -53,12 +53,12 @@ vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sent
refSum += semposScorer->weight(it->first);
}
- stats[0] = (int)(multCoeff * interSum);
- stats[1] = (int)(multCoeff * refSum);
+ stats[0] = (ScoreStatsType)(multCoeff * interSum);
+ stats[1] = (ScoreStatsType)(multCoeff * refSum);
return stats;
}
-float CapMicroOverlapping::calculateScore(const vector<int>& stats) const
+float CapMicroOverlapping::calculateScore(const vector<ScoreStatsType>& stats) const
{
if (stats.size() != 2) {
throw std::runtime_error("Size of stats vector has to be 2");
@@ -67,9 +67,9 @@ float CapMicroOverlapping::calculateScore(const vector<int>& stats) const
return stats[0] / static_cast<float>(stats[1]);
}
-vector<int> CapMacroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
+vector<ScoreStatsType> CapMacroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
{
- vector<int> stats(2 * kMaxNOC);
+ vector<ScoreStatsType> stats(2 * kMaxNOC);
sentence_t intersection;
set_intersection(cand.begin(), cand.end(), ref.begin(), ref.end(),
@@ -92,7 +92,7 @@ vector<int> CapMacroOverlapping::prepareStats(const sentence_t& cand, const sent
return stats;
}
-float CapMacroOverlapping::calculateScore(const vector<int>& stats) const
+float CapMacroOverlapping::calculateScore(const vector<ScoreStatsType>& stats) const
{
if (stats.size() != 2 * kMaxNOC) {
// TODO: Add some comments. The number "38" looks like a magic number.
diff --git a/mert/SemposOverlapping.h b/mert/SemposOverlapping.h
index 5eddbaef3..88409f77a 100644
--- a/mert/SemposOverlapping.h
+++ b/mert/SemposOverlapping.h
@@ -7,6 +7,8 @@
#include <utility>
#include <vector>
+#include "Types.h"
+
namespace MosesTuning
{
@@ -31,8 +33,8 @@ class SemposOverlapping
{
public:
virtual ~SemposOverlapping() {}
- virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref) = 0;
- virtual float calculateScore(const std::vector<int>& stats) const = 0;
+ virtual std::vector<ScoreStatsType> prepareStats(const sentence_t& cand, const sentence_t& ref) = 0;
+ virtual float calculateScore(const std::vector<ScoreStatsType>& stats) const = 0;
virtual std::size_t NumberOfScores() const = 0;
};
@@ -61,8 +63,8 @@ public:
CapMicroOverlapping(const SemposScorer* sempos) : semposScorer(sempos) {}
~CapMicroOverlapping() {}
- virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
- virtual float calculateScore(const std::vector<int>& stats) const;
+ virtual std::vector<ScoreStatsType> prepareStats(const sentence_t& cand, const sentence_t& ref);
+ virtual float calculateScore(const std::vector<ScoreStatsType>& stats) const;
virtual std::size_t NumberOfScores() const {
return 2;
}
@@ -83,8 +85,8 @@ public:
CapMacroOverlapping(const SemposScorer* sempos) : semposScorer(sempos) {}
~CapMacroOverlapping() {}
- virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
- virtual float calculateScore(const std::vector<int>& stats) const;
+ virtual std::vector<ScoreStatsType> prepareStats(const sentence_t& cand, const sentence_t& ref);
+ virtual float calculateScore(const std::vector<ScoreStatsType>& stats) const;
virtual std::size_t NumberOfScores() const {
return kMaxNOC * 2;
}
diff --git a/mert/SemposScorer.h b/mert/SemposScorer.h
index b6c735bbe..434822355 100644
--- a/mert/SemposScorer.h
+++ b/mert/SemposScorer.h
@@ -35,7 +35,7 @@ public:
virtual std::size_t NumberOfScores() const {
return m_ovr->NumberOfScores();
}
- virtual float calculateScore(const std::vector<int>& comps) const {
+ virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const {
return m_ovr->calculateScore(comps);
}
diff --git a/mert/StatisticsBasedScorer.cpp b/mert/StatisticsBasedScorer.cpp
index 869e2f55a..6c1e0c8c3 100644
--- a/mert/StatisticsBasedScorer.cpp
+++ b/mert/StatisticsBasedScorer.cpp
@@ -67,7 +67,7 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
throw runtime_error("No candidates supplied");
}
int numCounts = m_score_data->get(0,candidates[0]).size();
- vector<int> totals(numCounts);
+ vector<ScoreStatsType> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
ScoreStats stats = m_score_data->get(i,candidates[i]);
if (stats.size() != totals.size()) {
diff --git a/mert/StatisticsBasedScorer.h b/mert/StatisticsBasedScorer.h
index 644873b60..f1c77e0ba 100644
--- a/mert/StatisticsBasedScorer.h
+++ b/mert/StatisticsBasedScorer.h
@@ -11,6 +11,8 @@
#include "Scorer.h"
+#include "util/exception.hh"
+
namespace MosesTuning
{
@@ -21,6 +23,8 @@ namespace MosesTuning
*/
class StatisticsBasedScorer : public Scorer
{
+friend class HopeFearDecoder;
+
public:
StatisticsBasedScorer(const std::string& name, const std::string& config);
virtual ~StatisticsBasedScorer() {}
@@ -38,7 +42,12 @@ protected:
/**
* Calculate the actual score.
*/
- virtual statscore_t calculateScore(const std::vector<int>& totals) const = 0;
+ virtual statscore_t calculateScore(const std::vector<ScoreStatsType>& totals) const = 0;
+
+ virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const {
+ UTIL_THROW(util::Exception, "getReferenceLength not implemented for this scorer type.");
+ return 0;
+ }
// regularisation
RegularisationType m_regularization_type;
diff --git a/mert/TER/alignmentStruct.cpp b/mert/TER/alignmentStruct.cpp
index 15b4a8032..544ee61ac 100644
--- a/mert/TER/alignmentStruct.cpp
+++ b/mert/TER/alignmentStruct.cpp
@@ -1,17 +1,37 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "alignmentStruct.h"
using namespace std;
namespace TERCpp
{
-string alignmentStruct::toString()
-{
- stringstream s;
+ string alignmentStruct::toString()
+ {
+ stringstream s;
// s << "nword : " << vectorToString(nwords)<<endl;
// s << "alignment" << vectorToString(alignment)<<endl;
// s << "afterShift" << vectorToString(alignment)<<endl;
- s << "Nothing to be printed" <<endl;
- return s.str();
-}
+ s << "Nothing to be printed" <<endl;
+ return s.str();
+ }
// alignmentStruct::alignmentStruct()
// {
@@ -79,7 +99,7 @@ string alignmentStruct::toString()
// return s.str();
// }
-/* The distance of the shift. */
+ /* The distance of the shift. */
// int alignmentStruct::distance()
// {
// if (moveto < start)
diff --git a/mert/TER/alignmentStruct.h b/mert/TER/alignmentStruct.h
index 9e9a75468..adda2c345 100644
--- a/mert/TER/alignmentStruct.h
+++ b/mert/TER/alignmentStruct.h
@@ -1,5 +1,26 @@
-#ifndef MERT_TER_ALIGNMENT_STRUCT_H_
-#define MERT_TER_ALIGNMENT_STRUCT_H_
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
+#ifndef MERT_TER_ALIGNMENTSTRUCT_H_
+#define MERT_TER_ALIGNMENTSTRUCT_H_
+
#include <vector>
#include <stdio.h>
@@ -7,15 +28,16 @@
#include <sstream>
#include "tools.h"
+
using namespace std;
using namespace Tools;
namespace TERCpp
{
-class alignmentStruct
-{
-private:
-public:
+ class alignmentStruct
+ {
+ private:
+ public:
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
@@ -31,15 +53,14 @@ public:
// int end;
// int moveto;
// int newloc;
- vector<string> nwords; // The words we shifted
- vector<char> alignment ; // for pra_more output
- vector<vecInt> aftershift; // for pra_more output
- // This is used to store the cost of a shift, so we don't have to
- // calculate it multiple times.
- double cost;
- string toString();
-};
+ vector<string> nwords; // The words we shifted
+ vector<char> alignment ; // for pra_more output
+ vector<vecInt> aftershift; // for pra_more output
+ // This is used to store the cost of a shift, so we don't have to
+ // calculate it multiple times.
+ double cost;
+ string toString();
+ };
}
-
-#endif // MERT_TER_ALIGNMENT_STRUCT_H_
+#endif \ No newline at end of file
diff --git a/mert/TER/bestShiftStruct.h b/mert/TER/bestShiftStruct.h
index bfebe3b1e..9457fd1d8 100644
--- a/mert/TER/bestShiftStruct.h
+++ b/mert/TER/bestShiftStruct.h
@@ -1,5 +1,26 @@
-#ifndef MERT_TER_BEST_SHIFT_STRUCT_H_
-#define MERT_TER_BEST_SHIFT_STRUCT_H_
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
+#ifndef __BESTSHIFTSTRUCT_H_
+#define __BESTSHIFTSTRUCT_H_
+
#include <vector>
#include <stdio.h>
@@ -15,10 +36,10 @@ using namespace Tools;
namespace TERCpp
{
-class bestShiftStruct
-{
-private:
-public:
+ class bestShiftStruct
+ {
+ private:
+ public:
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
@@ -34,17 +55,16 @@ public:
// int end;
// int moveto;
// int newloc;
- terShift m_best_shift;
- terAlignment m_best_align;
- bool m_empty;
+ terShift m_best_shift;
+ terAlignment m_best_align;
+ bool m_empty;
// vector<string> nwords; // The words we shifted
// char* alignment ; // for pra_more output
// vector<vecInt> aftershift; // for pra_more output
- // This is used to store the cost of a shift, so we don't have to
- // calculate it multiple times.
+ // This is used to store the cost of a shift, so we don't have to
+ // calculate it multiple times.
// double cost;
-};
+ };
}
-
-#endif // MERT_TER_BEST_SHIFT_STRUCT_H_
+#endif \ No newline at end of file
diff --git a/mert/TER/hashMap.cpp b/mert/TER/hashMap.cpp
index 469167aaa..de84ff796 100644
--- a/mert/TER/hashMap.cpp
+++ b/mert/TER/hashMap.cpp
@@ -1,3 +1,23 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "hashMap.h"
// The following class defines a hash function for strings
@@ -8,142 +28,156 @@ using namespace std;
namespace HashMapSpace
{
// hashMap::hashMap();
-/* hashMap::~hashMap()
+ /* hashMap::~hashMap()
+ {
+ // vector<stringHasher>::const_iterator del = m_hasher.begin();
+ for ( vector<stringHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
+ {
+ delete(*del);
+ }
+ }*/
+ /**
+ * int hashMap::trouve ( long searchKey )
+ * @param searchKey
+ * @return
+ */
+ int hashMap::trouve ( long searchKey )
{
-// vector<stringHasher>::const_iterator del = m_hasher.begin();
- for ( vector<stringHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
- {
- delete(*del);
- }
- }*/
-/**
- * int hashMap::trouve ( long searchKey )
- * @param searchKey
- * @return
- */
-int hashMap::trouve ( long searchKey )
-{
- long foundKey;
+ long foundKey;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return 1;
+ for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return 1;
+ }
+ }
+ return 0;
}
- }
- return 0;
-}
-int hashMap::trouve ( string key )
-{
- long searchKey=hashValue ( key );
- long foundKey;;
+ int hashMap::trouve ( string key )
+ {
+ long searchKey=hashValue ( key );
+ long foundKey;;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return 1;
+ for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return 1;
+ }
+ }
+ return 0;
}
- }
- return 0;
-}
-/**
- * long hashMap::hashValue ( string key )
- * @param key
- * @return
- */
-long hashMap::hashValue ( string key )
-{
- locale loc; // the "C" locale
- const collate<char>& coll = use_facet<collate<char> >(loc);
- return coll.hash(key.data(),key.data()+key.length());
+ /**
+ * long hashMap::hashValue ( string key )
+ * @param key
+ * @return
+ */
+ long hashMap::hashValue ( string key )
+ {
+ locale loc; // the "C" locale
+ const collate<char>& coll = use_facet<collate<char> >(loc);
+ return coll.hash(key.data(),key.data()+key.length());
// boost::hash<string> hasher;
// return hasher ( key );
-}
-/**
- * void hashMap::addHasher ( string key, string value )
- * @param key
- * @param value
- */
-void hashMap::addHasher ( string key, string value )
-{
- if ( trouve ( hashValue ( key ) ) ==0 ) {
+ }
+ /**
+ * void hashMap::addHasher ( string key, string value )
+ * @param key
+ * @param value
+ */
+ void hashMap::addHasher ( string key, string value )
+ {
+ if ( trouve ( hashValue ( key ) ) ==0 )
+ {
// cerr << "ICI1" <<endl;
- stringHasher H ( hashValue ( key ),key,value );
+ stringHasher H ( hashValue ( key ),key,value );
// cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl;
// cerr << "ICI2" <<endl;
- m_hasher.push_back ( H );
- }
-}
-stringHasher hashMap::getHasher ( string key )
-{
- long searchKey=hashValue ( key );
- long foundKey;
- stringHasher defaut(0,"","");
+ m_hasher.push_back ( H );
+ }
+ }
+ stringHasher hashMap::getHasher ( string key )
+ {
+ long searchKey=hashValue ( key );
+ long foundKey;
+ stringHasher defaut(0,"","");
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return ( *l_hasher );
+ for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return ( *l_hasher );
+ }
+ }
+ return defaut;
}
- }
- return defaut;
-}
-string hashMap::getValue ( string key )
-{
- long searchKey=hashValue ( key );
- long foundKey;
+ string hashMap::getValue ( string key )
+ {
+ long searchKey=hashValue ( key );
+ long foundKey;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
+ for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
// cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl;
- return ( *l_hasher ).getValue();
+ return ( *l_hasher ).getValue();
+ }
+ }
+ return "";
}
- }
- return "";
-}
-string hashMap::searchValue ( string value )
-{
+ string hashMap::searchValue ( string value )
+ {
// long searchKey=hashValue ( key );
// long foundKey;
- string foundValue;
+ string foundValue;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundValue= ( *l_hasher ).getValue();
- if ( foundValue.compare ( value ) == 0 ) {
- return ( *l_hasher ).getKey();
+ for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundValue= ( *l_hasher ).getValue();
+ if ( foundValue.compare ( value ) == 0 )
+ {
+ return ( *l_hasher ).getKey();
+ }
+ }
+ return "";
}
- }
- return "";
-}
-void hashMap::setValue ( string key , string value )
-{
- long searchKey=hashValue ( key );
- long foundKey;
+ void hashMap::setValue ( string key , string value )
+ {
+ long searchKey=hashValue ( key );
+ long foundKey;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- ( *l_hasher ).setValue ( value );
+ for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ ( *l_hasher ).setValue ( value );
// return ( *l_hasher ).getValue();
+ }
+ }
}
- }
-}
-/**
- *
- */
-void hashMap::printHash()
-{
- for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
- }
-}
+ /**
+ *
+ */
+ void hashMap::printHash()
+ {
+ for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
+ }
+ }
diff --git a/mert/TER/hashMap.h b/mert/TER/hashMap.h
index 85020d041..6cb721573 100644
--- a/mert/TER/hashMap.h
+++ b/mert/TER/hashMap.h
@@ -1,10 +1,29 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
/*
* Generic hashmap manipulation functions
*/
-
-#ifndef MERT_TER_HASHMAP_H_
-#define MERT_TER_HASHMAP_H_
-
+#ifndef __HASHMAP_H_
+#define __HASHMAP_H_
+#include <boost/functional/hash.hpp>
#include "stringHasher.h"
#include <vector>
#include <string>
@@ -16,28 +35,30 @@ using namespace std;
namespace HashMapSpace
{
-class hashMap
-{
-private:
- vector<stringHasher> m_hasher;
+ class hashMap
+ {
+ private:
+ vector<stringHasher> m_hasher;
-public:
+ public:
// ~hashMap();
- long hashValue ( string key );
- int trouve ( long searchKey );
- int trouve ( string key );
- void addHasher ( string key, string value );
- stringHasher getHasher ( string key );
- string getValue ( string key );
- string searchValue ( string key );
- void setValue ( string key , string value );
- void printHash();
- vector<stringHasher> getHashMap();
- string printStringHash();
- string printStringHash2();
- string printStringHashForLexicon();
-};
+ long hashValue ( string key );
+ int trouve ( long searchKey );
+ int trouve ( string key );
+ void addHasher ( string key, string value );
+ stringHasher getHasher ( string key );
+ string getValue ( string key );
+ string searchValue ( string key );
+ void setValue ( string key , string value );
+ void printHash();
+ vector<stringHasher> getHashMap();
+ string printStringHash();
+ string printStringHash2();
+ string printStringHashForLexicon();
+ };
+
}
-#endif // MERT_TER_HASHMAP_H_
+
+#endif
diff --git a/mert/TER/hashMapInfos.cpp b/mert/TER/hashMapInfos.cpp
index 9cd431196..23f57d808 100644
--- a/mert/TER/hashMapInfos.cpp
+++ b/mert/TER/hashMapInfos.cpp
@@ -1,3 +1,23 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "hashMapInfos.h"
// The following class defines a hash function for strings
@@ -8,108 +28,117 @@ using namespace std;
namespace HashMapSpace
{
// hashMapInfos::hashMap();
-/* hashMapInfos::~hashMap()
+ /* hashMapInfos::~hashMap()
+ {
+ // vector<infosHasher>::const_iterator del = m_hasher.begin();
+ for ( vector<infosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
+ {
+ delete(*del);
+ }
+ }*/
+ /**
+ * int hashMapInfos::trouve ( long searchKey )
+ * @param searchKey
+ * @return
+ */
+ int hashMapInfos::trouve ( long searchKey )
{
-// vector<infosHasher>::const_iterator del = m_hasher.begin();
- for ( vector<infosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
- {
- delete(*del);
- }
- }*/
-/**
- * int hashMapInfos::trouve ( long searchKey )
- * @param searchKey
- * @return
- */
-int hashMapInfos::trouve ( long searchKey )
-{
- long foundKey;
+ long foundKey;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return 1;
+ for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return 1;
+ }
+ }
+ return 0;
}
- }
- return 0;
-}
-int hashMapInfos::trouve ( string key )
-{
- long searchKey=hashValue ( key );
- long foundKey;;
+ int hashMapInfos::trouve ( string key )
+ {
+ long searchKey=hashValue ( key );
+ long foundKey;;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return 1;
+ for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return 1;
+ }
+ }
+ return 0;
}
- }
- return 0;
-}
-/**
- * long hashMapInfos::hashValue ( string key )
- * @param key
- * @return
- */
-long hashMapInfos::hashValue ( string key )
-{
- locale loc; // the "C" locale
- const collate<char>& coll = use_facet<collate<char> >(loc);
- return coll.hash(key.data(),key.data()+key.length());
+ /**
+ * long hashMapInfos::hashValue ( string key )
+ * @param key
+ * @return
+ */
+ long hashMapInfos::hashValue ( string key )
+ {
+ locale loc; // the "C" locale
+ const collate<char>& coll = use_facet<collate<char> >(loc);
+ return coll.hash(key.data(),key.data()+key.length());
// boost::hash<string> hasher;
// return hasher ( key );
-}
-/**
- * void hashMapInfos::addHasher ( string key, string value )
- * @param key
- * @param value
- */
-void hashMapInfos::addHasher ( string key, vector<int> value )
-{
- if ( trouve ( hashValue ( key ) ) ==0 ) {
+ }
+ /**
+ * void hashMapInfos::addHasher ( string key, string value )
+ * @param key
+ * @param value
+ */
+ void hashMapInfos::addHasher ( string key, vector<int> value )
+ {
+ if ( trouve ( hashValue ( key ) ) ==0 )
+ {
// cerr << "ICI1" <<endl;
- infosHasher H ( hashValue ( key ),key,value );
+ infosHasher H ( hashValue ( key ),key,value );
// cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl;
// cerr << "ICI2" <<endl;
- m_hasher.push_back ( H );
- }
-}
-void hashMapInfos::addValue ( string key, vector<int> value )
-{
- addHasher ( key, value );
-}
-infosHasher hashMapInfos::getHasher ( string key )
-{
- long searchKey=hashValue ( key );
- long foundKey;
+ m_hasher.push_back ( H );
+ }
+ }
+ void hashMapInfos::addValue ( string key, vector<int> value )
+ {
+ addHasher ( key, value );
+ }
+ infosHasher hashMapInfos::getHasher ( string key )
+ {
+ long searchKey=hashValue ( key );
+ long foundKey;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return ( *l_hasher );
+ for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return ( *l_hasher );
+ }
+ }
+ vector<int> temp;
+ infosHasher defaut(0,"",temp);
+ return defaut;
}
- }
- vector<int> temp;
- infosHasher defaut(0,"",temp);
- return defaut;
-}
-vector<int> hashMapInfos::getValue ( string key )
-{
- long searchKey=hashValue ( key );
- long foundKey;
- vector<int> retour;
+ vector<int> hashMapInfos::getValue ( string key )
+ {
+ long searchKey=hashValue ( key );
+ long foundKey;
+ vector<int> retour;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
+ for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
// cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl;
- return ( *l_hasher ).getValue();
+ return ( *l_hasher ).getValue();
+ }
+ }
+ return retour;
}
- }
- return retour;
-}
// string hashMapInfos::searchValue ( string value )
// {
// // long searchKey=hashValue ( key );
@@ -129,30 +158,42 @@ vector<int> hashMapInfos::getValue ( string key )
// }
//
-void hashMapInfos::setValue ( string key , vector<int> value )
-{
- long searchKey=hashValue ( key );
- long foundKey;
+ void hashMapInfos::setValue ( string key , vector<int> value )
+ {
+ long searchKey=hashValue ( key );
+ long foundKey;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
- foundKey= ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- ( *l_hasher ).setValue ( value );
+ for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
+ foundKey= ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ ( *l_hasher ).setValue ( value );
// return ( *l_hasher ).getValue();
+ }
+ }
+ }
+ string hashMapInfos::toString ()
+ {
+ stringstream to_return;
+ for ( vector<infosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ )
+ {
+ to_return << (*l_hasher).toString();
+ // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
+ }
+ return to_return.str();
}
- }
-}
-
-/**
- *
- */
-void hashMapInfos::printHash()
-{
- for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
+ /**
+ *
+ */
+ void hashMapInfos::printHash()
+ {
+ for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ {
// cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
- }
-}
+ }
+ }
diff --git a/mert/TER/hashMapInfos.h b/mert/TER/hashMapInfos.h
index 8b56e9d02..5e7dbb6e7 100644
--- a/mert/TER/hashMapInfos.h
+++ b/mert/TER/hashMapInfos.h
@@ -1,9 +1,29 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
/*
* Generic hashmap manipulation functions
*/
-#ifndef MERT_TER_HASHMAP_INFOS_H_
-#define MERT_TER_HASHMAP_INFOS_H_
-
+#ifndef __HASHMAPINFOS_H_
+#define __HASHMAPINFOS_H_
+#include <boost/functional/hash.hpp>
#include "infosHasher.h"
#include <vector>
#include <string>
@@ -14,29 +34,32 @@ using namespace std;
namespace HashMapSpace
{
-class hashMapInfos
-{
-private:
- vector<infosHasher> m_hasher;
+ class hashMapInfos
+ {
+ private:
+ vector<infosHasher> m_hasher;
-public:
+ public:
// ~hashMap();
- long hashValue ( string key );
- int trouve ( long searchKey );
- int trouve ( string key );
- void addHasher ( string key, vector<int> value );
- void addValue ( string key, vector<int> value );
- infosHasher getHasher ( string key );
- vector<int> getValue ( string key );
+ long hashValue ( string key );
+ int trouve ( long searchKey );
+ int trouve ( string key );
+ void addHasher ( string key, vector<int> value );
+ void addValue ( string key, vector<int> value );
+ infosHasher getHasher ( string key );
+ vector<int> getValue ( string key );
// string searchValue ( string key );
- void setValue ( string key , vector<int> value );
- void printHash();
- vector<infosHasher> getHashMap();
- string printStringHash();
- string printStringHash2();
- string printStringHashForLexicon();
-};
+ void setValue ( string key , vector<int> value );
+ void printHash();
+ string toString();
+ vector<infosHasher> getHashMap();
+ string printStringHash();
+ string printStringHash2();
+ string printStringHashForLexicon();
+ };
+
}
-#endif // MERT_TER_HASHMAP_INFOS_H_
+
+#endif
diff --git a/mert/TER/hashMapStringInfos.cpp b/mert/TER/hashMapStringInfos.cpp
index 0fbb0a98a..773c148d4 100644
--- a/mert/TER/hashMapStringInfos.cpp
+++ b/mert/TER/hashMapStringInfos.cpp
@@ -1,3 +1,23 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "hashMapStringInfos.h"
// The following class defines a hash function for strings
@@ -7,157 +27,179 @@ using namespace std;
namespace HashMapSpace
{
-// hashMapStringInfos::hashMap();
-/* hashMapStringInfos::~hashMap()
-{
-// vector<stringInfosHasher>::const_iterator del = m_hasher.begin();
- for ( vector<stringInfosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
- {
- delete(*del);
- }
-}*/
-/**
-* int hashMapStringInfos::trouve ( long searchKey )
-* @param searchKey
-* @return
-*/
-int hashMapStringInfos::trouve ( long searchKey )
-{
- long foundKey;
- // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
- foundKey = ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return 1;
+ // hashMapStringInfos::hashMap();
+ /* hashMapStringInfos::~hashMap()
+ {
+ // vector<stringInfosHasher>::const_iterator del = m_hasher.begin();
+ for ( vector<stringInfosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
+ {
+ delete(*del);
+ }
+ }*/
+ /**
+ * int hashMapStringInfos::trouve ( long searchKey )
+ * @param searchKey
+ * @return
+ */
+ int hashMapStringInfos::trouve ( long searchKey )
+ {
+ long foundKey;
+ // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
+ for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ )
+ {
+ foundKey = ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return 1;
+ }
+ }
+ return 0;
}
- }
- return 0;
-}
-int hashMapStringInfos::trouve ( string key )
-{
- long searchKey = hashValue ( key );
- long foundKey;;
- // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
- foundKey = ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return 1;
+ int hashMapStringInfos::trouve ( string key )
+ {
+ long searchKey = hashValue ( key );
+ long foundKey;;
+ // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
+ for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ )
+ {
+ foundKey = ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return 1;
+ }
+ }
+ return 0;
}
- }
- return 0;
-}
-/**
-* long hashMapStringInfos::hashValue ( string key )
-* @param key
-* @return
-*/
-long hashMapStringInfos::hashValue ( string key )
-{
- locale loc; // the "C" locale
- const collate<char>& coll = use_facet<collate<char> > ( loc );
- return coll.hash ( key.data(), key.data() + key.length() );
+ /**
+ * long hashMapStringInfos::hashValue ( string key )
+ * @param key
+ * @return
+ */
+ long hashMapStringInfos::hashValue ( string key )
+ {
+ locale loc; // the "C" locale
+ const collate<char>& coll = use_facet<collate<char> > ( loc );
+ return coll.hash ( key.data(), key.data() + key.length() );
// boost::hash<string> hasher;
// return hasher ( key );
-}
-/**
-* void hashMapStringInfos::addHasher ( string key, string value )
-* @param key
-* @param value
-*/
-void hashMapStringInfos::addHasher ( string key, vector<string> value )
-{
- if ( trouve ( hashValue ( key ) ) == 0 ) {
- // cerr << "ICI1" <<endl;
- stringInfosHasher H ( hashValue ( key ), key, value );
- // cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl;
- // cerr << "ICI2" <<endl;
-
- m_hasher.push_back ( H );
- }
-}
-void hashMapStringInfos::addValue ( string key, vector<string> value )
-{
- addHasher ( key, value );
-}
-stringInfosHasher hashMapStringInfos::getHasher ( string key )
-{
- long searchKey = hashValue ( key );
- long foundKey;
- // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
- foundKey = ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- return ( *l_hasher );
}
- }
- vector<string> tmp;
- stringInfosHasher defaut ( 0, "", tmp );
- return defaut;
-}
-vector<string> hashMapStringInfos::getValue ( string key )
-{
- long searchKey = hashValue ( key );
- long foundKey;
- vector<string> retour;
- // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
- foundKey = ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- // cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl;
- return ( *l_hasher ).getValue();
+ /**
+ * void hashMapStringInfos::addHasher ( string key, string value )
+ * @param key
+ * @param value
+ */
+ void hashMapStringInfos::addHasher ( string key, vector<string> value )
+ {
+ if ( trouve ( hashValue ( key ) ) == 0 )
+ {
+ // cerr << "ICI1" <<endl;
+ stringInfosHasher H ( hashValue ( key ), key, value );
+ // cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl;
+ // cerr << "ICI2" <<endl;
+
+ m_hasher.push_back ( H );
+ }
}
- }
- return retour;
-}
-// string hashMapStringInfos::searchValue ( string value )
-// {
-// // long searchKey=hashValue ( key );
-// // long foundKey;
-// vector<int> foundValue;
-//
-// // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
-// for ( vector<stringInfosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
-// {
-// foundValue= ( *l_hasher ).getValue();
-// /* if ( foundValue.compare ( value ) == 0 )
-// {
-// return ( *l_hasher ).getKey();
-// }*/
-// }
-// return "";
-// }
-//
-
-void hashMapStringInfos::setValue ( string key , vector<string> value )
-{
- long searchKey = hashValue ( key );
- long foundKey;
- // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
- for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
- foundKey = ( *l_hasher ).getHashKey();
- if ( searchKey == foundKey ) {
- ( *l_hasher ).setValue ( value );
- // return ( *l_hasher ).getValue();
+ void hashMapStringInfos::addValue ( string key, vector<string> value )
+ {
+ addHasher ( key, value );
+ }
+ stringInfosHasher hashMapStringInfos::getHasher ( string key )
+ {
+ long searchKey = hashValue ( key );
+ long foundKey;
+ // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
+ for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ )
+ {
+ foundKey = ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ return ( *l_hasher );
+ }
+ }
+ vector<string> tmp;
+ stringInfosHasher defaut ( 0, "", tmp );
+ return defaut;
+ }
+ vector<string> hashMapStringInfos::getValue ( string key )
+ {
+ long searchKey = hashValue ( key );
+ long foundKey;
+ vector<string> retour;
+ // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
+ for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ )
+ {
+ foundKey = ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ // cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl;
+ return ( *l_hasher ).getValue();
+ }
+ }
+ return retour;
+ }
+ // string hashMapStringInfos::searchValue ( string value )
+ // {
+ // // long searchKey=hashValue ( key );
+ // // long foundKey;
+ // vector<int> foundValue;
+ //
+ // // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
+ // for ( vector<stringInfosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
+ // {
+ // foundValue= ( *l_hasher ).getValue();
+ // /* if ( foundValue.compare ( value ) == 0 )
+ // {
+ // return ( *l_hasher ).getKey();
+ // }*/
+ // }
+ // return "";
+ // }
+ //
+
+ void hashMapStringInfos::setValue ( string key , vector<string> value )
+ {
+ long searchKey = hashValue ( key );
+ long foundKey;
+ // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
+ for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ )
+ {
+ foundKey = ( *l_hasher ).getHashKey();
+ if ( searchKey == foundKey )
+ {
+ ( *l_hasher ).setValue ( value );
+ // return ( *l_hasher ).getValue();
+ }
+ }
}
- }
-}
+ string hashMapStringInfos::toString ()
+ {
+ stringstream to_return;
+ for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ )
+ {
+ to_return << (*l_hasher).toString();
+ // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
+ }
+ return to_return.str();
+ }
-/**
-*
-*/
-void hashMapStringInfos::printHash()
-{
- for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
- // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
- }
-}
-vector< stringInfosHasher > hashMapStringInfos::getHashMap()
-{
- return m_hasher;
-}
+ /**
+ *
+ */
+ void hashMapStringInfos::printHash()
+ {
+ for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ )
+ {
+ // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
+ }
+ }
+ vector< stringInfosHasher > hashMapStringInfos::getHashMap()
+ {
+ return m_hasher;
+ }
diff --git a/mert/TER/hashMapStringInfos.h b/mert/TER/hashMapStringInfos.h
index 870274f3d..5337d50f2 100644
--- a/mert/TER/hashMapStringInfos.h
+++ b/mert/TER/hashMapStringInfos.h
@@ -1,9 +1,29 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
/*
* Generic hashmap manipulation functions
*/
-#ifndef MERT_TER_HASHMAP_STRING_INFOS_H_
-#define MERT_TER_HASHMAP_STRING_INFOS_H_
-
+#ifndef __HASHMAPSTRINGINFOS_H_
+#define __HASHMAPSTRINGINFOS_H_
+#include <boost/functional/hash.hpp>
#include "stringInfosHasher.h"
#include <vector>
#include <string>
@@ -14,29 +34,32 @@ using namespace std;
namespace HashMapSpace
{
-class hashMapStringInfos
-{
-private:
- vector<stringInfosHasher> m_hasher;
+ class hashMapStringInfos
+ {
+ private:
+ vector<stringInfosHasher> m_hasher;
-public:
+ public:
// ~hashMap();
- long hashValue ( string key );
- int trouve ( long searchKey );
- int trouve ( string key );
- void addHasher ( string key, vector<string> value );
- void addValue ( string key, vector<string> value );
- stringInfosHasher getHasher ( string key );
- vector<string> getValue ( string key );
+ long hashValue ( string key );
+ int trouve ( long searchKey );
+ int trouve ( string key );
+ void addHasher ( string key, vector<string> value );
+ void addValue ( string key, vector<string> value );
+ stringInfosHasher getHasher ( string key );
+ vector<string> getValue ( string key );
// string searchValue ( string key );
- void setValue ( string key , vector<string> value );
- void printHash();
- vector<stringInfosHasher> getHashMap();
- string printStringHash();
- string printStringHash2();
- string printStringHashForLexicon();
-};
+ void setValue ( string key , vector<string> value );
+ void printHash();
+ string toString();
+ vector<stringInfosHasher> getHashMap();
+ string printStringHash();
+ string printStringHash2();
+ string printStringHashForLexicon();
+ };
+
}
-#endif // MERT_TER_HASHMAP_STRING_INFOS_H_
+
+#endif
diff --git a/mert/TER/infosHasher.cpp b/mert/TER/infosHasher.cpp
index 654b0b26f..8ce23ae44 100644
--- a/mert/TER/infosHasher.cpp
+++ b/mert/TER/infosHasher.cpp
@@ -1,34 +1,61 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "infosHasher.h"
// The following class defines a hash function for strings
using namespace std;
+using namespace Tools;
namespace HashMapSpace
{
-infosHasher::infosHasher (long cle,string cleTxt, vector<int> valueVecInt )
-{
- m_hashKey=cle;
- m_key=cleTxt;
- m_value=valueVecInt;
-}
+ infosHasher::infosHasher (long cle,string cleTxt, vector<int> valueVecInt )
+ {
+ m_hashKey=cle;
+ m_key=cleTxt;
+ m_value=valueVecInt;
+ }
// infosHasher::~infosHasher(){};*/
-long infosHasher::getHashKey()
-{
- return m_hashKey;
-}
-string infosHasher::getKey()
-{
- return m_key;
-}
-vector<int> infosHasher::getValue()
-{
- return m_value;
-}
-void infosHasher::setValue ( vector<int> value )
-{
- m_value=value;
-}
+ long infosHasher::getHashKey()
+ {
+ return m_hashKey;
+ }
+ string infosHasher::getKey()
+ {
+ return m_key;
+ }
+ vector<int> infosHasher::getValue()
+ {
+ return m_value;
+ }
+ void infosHasher::setValue ( vector<int> value )
+ {
+ m_value=value;
+ }
+ string infosHasher::toString()
+ {
+ stringstream to_return;
+ to_return << m_hashKey << "\t" << m_key << "\t" << vectorToString(m_value,"\t") << endl;
+ return to_return.str();
+ }
// typedef stdext::hash_map<std::string,string, stringhasher> HASH_S_S;
diff --git a/mert/TER/infosHasher.h b/mert/TER/infosHasher.h
index 02a32280b..d3d56317a 100644
--- a/mert/TER/infosHasher.h
+++ b/mert/TER/infosHasher.h
@@ -1,31 +1,54 @@
-#ifndef MERT_TER_INFO_SHASHER_H_
-#define MERT_TER_INFO_SHASHER_H_
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
+#ifndef __INFOSHASHER_H_
+#define __INFOSHASHER_H_
#include <string>
+// #include <ext/hash_map>
#include <stdio.h>
#include <iostream>
+#include <sstream>
#include <vector>
+#include "tools.h"
using namespace std;
namespace HashMapSpace
{
-class infosHasher
-{
-private:
- long m_hashKey;
- string m_key;
- vector<int> m_value;
+ class infosHasher
+ {
+ private:
+ long m_hashKey;
+ string m_key;
+ vector<int> m_value;
-public:
- infosHasher ( long cle, string cleTxt, vector<int> valueVecInt );
- long getHashKey();
- string getKey();
- vector<int> getValue();
- void setValue ( vector<int> value );
+ public:
+ infosHasher ( long cle, string cleTxt, vector<int> valueVecInt );
+ long getHashKey();
+ string getKey();
+ vector<int> getValue();
+ void setValue ( vector<int> value );
+ string toString();
-};
+ };
-}
-#endif // MERT_TER_INFO_SHASHER_H_
+}
+#endif \ No newline at end of file
diff --git a/mert/TER/stringHasher.cpp b/mert/TER/stringHasher.cpp
index 24fde0e32..f4d1526e8 100644
--- a/mert/TER/stringHasher.cpp
+++ b/mert/TER/stringHasher.cpp
@@ -1,3 +1,23 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "stringHasher.h"
// The following class defines a hash function for strings
@@ -6,29 +26,29 @@ using namespace std;
namespace HashMapSpace
{
-stringHasher::stringHasher ( long cle, string cleTxt, string valueTxt )
-{
- m_hashKey=cle;
- m_key=cleTxt;
- m_value=valueTxt;
-}
+ stringHasher::stringHasher ( long cle, string cleTxt, string valueTxt )
+ {
+ m_hashKey=cle;
+ m_key=cleTxt;
+ m_value=valueTxt;
+ }
// stringHasher::~stringHasher(){};*/
-long stringHasher::getHashKey()
-{
- return m_hashKey;
-}
-string stringHasher::getKey()
-{
- return m_key;
-}
-string stringHasher::getValue()
-{
- return m_value;
-}
-void stringHasher::setValue ( string value )
-{
- m_value=value;
-}
+ long stringHasher::getHashKey()
+ {
+ return m_hashKey;
+ }
+ string stringHasher::getKey()
+ {
+ return m_key;
+ }
+ string stringHasher::getValue()
+ {
+ return m_value;
+ }
+ void stringHasher::setValue ( string value )
+ {
+ m_value=value;
+ }
// typedef stdext::hash_map<string, string, stringhasher> HASH_S_S;
diff --git a/mert/TER/stringHasher.h b/mert/TER/stringHasher.h
index 897bd9ff5..d831f642c 100644
--- a/mert/TER/stringHasher.h
+++ b/mert/TER/stringHasher.h
@@ -1,28 +1,50 @@
-#ifndef MERT_TER_STRING_HASHER_H_
-#define MERT_TER_STRING_HASHER_H_
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
+#ifndef __STRINGHASHER_H_
+#define __STRINGHASHER_H_
#include <string>
+//#include <ext/hash_map>
#include <iostream>
using namespace std;
namespace HashMapSpace
{
-class stringHasher
-{
-private:
- long m_hashKey;
- string m_key;
- string m_value;
-
-public:
- stringHasher ( long cle, string cleTxt, string valueTxt );
- long getHashKey();
- string getKey();
- string getValue();
- void setValue ( string value );
-};
+ class stringHasher
+ {
+ private:
+ long m_hashKey;
+ string m_key;
+ string m_value;
-}
+ public:
+ stringHasher ( long cle, string cleTxt, string valueTxt );
+ long getHashKey();
+ string getKey();
+ string getValue();
+ void setValue ( string value );
-#endif // MERT_TER_STRING_HASHER_H_
+
+ };
+
+
+}
+#endif
diff --git a/mert/TER/stringInfosHasher.cpp b/mert/TER/stringInfosHasher.cpp
index 3e02e7a20..007fd720f 100644
--- a/mert/TER/stringInfosHasher.cpp
+++ b/mert/TER/stringInfosHasher.cpp
@@ -1,34 +1,61 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "stringInfosHasher.h"
// The following class defines a hash function for strings
using namespace std;
+using namespace Tools;
namespace HashMapSpace
{
-stringInfosHasher::stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt )
-{
- m_hashKey=cle;
- m_key=cleTxt;
- m_value=valueVecInt;
-}
+ stringInfosHasher::stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt )
+ {
+ m_hashKey=cle;
+ m_key=cleTxt;
+ m_value=valueVecInt;
+ }
// stringInfosHasher::~stringInfosHasher(){};*/
-long stringInfosHasher::getHashKey()
-{
- return m_hashKey;
-}
-string stringInfosHasher::getKey()
-{
- return m_key;
-}
-vector<string> stringInfosHasher::getValue()
-{
- return m_value;
-}
-void stringInfosHasher::setValue ( vector<string> value )
-{
- m_value=value;
-}
+ long stringInfosHasher::getHashKey()
+ {
+ return m_hashKey;
+ }
+ string stringInfosHasher::getKey()
+ {
+ return m_key;
+ }
+ vector<string> stringInfosHasher::getValue()
+ {
+ return m_value;
+ }
+ void stringInfosHasher::setValue ( vector<string> value )
+ {
+ m_value=value;
+ }
+ string stringInfosHasher::toString()
+ {
+ stringstream to_return;
+ to_return << m_hashKey << "\t" << m_key << "\t" << vectorToString(m_value,"\t") << endl;
+ return to_return.str();
+ }
// typedef stdext::hash_map<string, string, stringhasher> HASH_S_S;
diff --git a/mert/TER/stringInfosHasher.h b/mert/TER/stringInfosHasher.h
index c1b891662..307b48da7 100644
--- a/mert/TER/stringInfosHasher.h
+++ b/mert/TER/stringInfosHasher.h
@@ -1,28 +1,52 @@
-#ifndef MERT_TER_STRING_INFOS_HASHER_H_
-#define MERT_TER_STRING_INFOS_HASHER_H_
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
+#ifndef __STRINGINFOSHASHER_H_
+#define __STRINGINFOSHASHER_H_
#include <string>
+// #include <ext/hash_map>
#include <iostream>
#include <vector>
+#include "tools.h"
using namespace std;
namespace HashMapSpace
{
-class stringInfosHasher
-{
-private:
- long m_hashKey;
- string m_key;
- vector<string> m_value;
-
-public:
- stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt );
- long getHashKey();
- string getKey();
- vector<string> getValue();
- void setValue ( vector<string> value );
-};
+ class stringInfosHasher
+ {
+ private:
+ long m_hashKey;
+ string m_key;
+ vector<string> m_value;
-}
+ public:
+ stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt );
+ long getHashKey();
+ string getKey();
+ vector<string> getValue();
+ void setValue ( vector<string> value );
+ string toString();
-#endif // MERT_TER_STRING_INFOS_HASHER_H_
+
+ };
+
+
+}
+#endif \ No newline at end of file
diff --git a/mert/TER/terAlignment.cpp b/mert/TER/terAlignment.cpp
index 87be53b11..6c5d35cc5 100644
--- a/mert/TER/terAlignment.cpp
+++ b/mert/TER/terAlignment.cpp
@@ -1,131 +1,214 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "terAlignment.h"
using namespace std;
namespace TERCpp
{
-terAlignment::terAlignment()
-{
+ terAlignment::terAlignment()
+ {
// vector<string> ref;
// vector<string> hyp;
// vector<string> aftershift;
- // TERshift[] allshifts = null;
+ // TERshift[] allshifts = null;
- numEdits=0;
- numWords=0;
- bestRef="";
+ numEdits=0;
+ numWords=0;
+ bestRef="";
- numIns=0;
- numDel=0;
- numSub=0;
- numSft=0;
- numWsf=0;
-}
-string terAlignment::toString()
-{
- stringstream s;
- s.str ( "" );
- s << "Original Ref: " << join ( " ", ref ) << endl;
- s << "Original Hyp: " << join ( " ", hyp ) <<endl;
- s << "Hyp After Shift: " << join ( " ", aftershift );
- s << endl;
+ numIns=0;
+ numDel=0;
+ numSub=0;
+ numSft=0;
+ numWsf=0;
+ }
+ string terAlignment::toString()
+ {
+ stringstream s;
+ s.str ( "" );
+ s << "Original Ref: \t" << join ( " ", ref ) << endl;
+ s << "Original Hyp: \t" << join ( " ", hyp ) <<endl;
+ s << "Hyp After Shift:\t" << join ( " ", aftershift );
+// s << "Hyp After Shift: " << join ( " ", aftershift );
+ s << endl;
// string s = "Original Ref: " + join(" ", ref) + "\nOriginal Hyp: " + join(" ", hyp) + "\nHyp After Shift: " + join(" ", aftershift);
- if ( ( int ) sizeof ( alignment ) >0 ) {
- s << "Alignment: (";
+ if ( ( int ) sizeof ( alignment ) >0 )
+ {
+ s << "Alignment: (";
// s += "\nAlignment: (";
- for ( int i = 0; i < ( int ) ( alignment.size() ); i++ ) {
- s << alignment[i];
+ for ( int i = 0; i < ( int ) ( alignment.size() ); i++ )
+ {
+ s << alignment[i];
// s+=alignment[i];
- }
+ }
// s += ")";
- s << ")";
- }
- s << endl;
- if ( ( int ) allshifts.size() == 0 ) {
+ s << ")";
+ }
+ s << endl;
+ if ( ( int ) allshifts.size() == 0 )
+ {
// s += "\nNumShifts: 0";
- s << "NumShifts: 0";
- } else {
+ s << "NumShifts: 0";
+ }
+ else
+ {
// s += "\nNumShifts: " + (int)allshifts.size();
- s << "NumShifts: "<< ( int ) allshifts.size();
- for ( int i = 0; i < ( int ) allshifts.size(); i++ ) {
- s << endl << " " ;
- s << ( ( terShift ) allshifts[i] ).toString();
+ s << "NumShifts: "<< ( int ) allshifts.size();
+ for ( int i = 0; i < ( int ) allshifts.size(); i++ )
+ {
+ s << endl << " " ;
+ s << ( ( terShift ) allshifts[i] ).toString();
// s += "\n " + allshifts[i];
- }
- }
- s << endl << "Score: " << scoreAv() << " (" << numEdits << "/" << averageWords << ")";
+ }
+ }
+ s << endl << "Score: " << scoreAv() << " (" << numEdits << "/" << averageWords << ")";
// s += "\nScore: " + score() + " (" + numEdits + "/" + numWords + ")";
- return s.str();
+ return s.str();
-}
-string terAlignment::join ( string delim, vector<string> arr )
-{
- if ( ( int ) arr.size() == 0 ) return "";
+ }
+ string terAlignment::join ( string delim, vector<string> arr )
+ {
+ if ( ( int ) arr.size() == 0 ) return "";
// if ((int)delim.compare("") == 0) delim = new String("");
// String s = new String("");
- stringstream s;
- s.str ( "" );
- for ( int i = 0; i < ( int ) arr.size(); i++ ) {
- if ( i == 0 ) {
- s << arr.at ( i );
- } else {
- s << delim << arr.at ( i );
- }
- }
- return s.str();
+ stringstream s;
+ s.str ( "" );
+ for ( int i = 0; i < ( int ) arr.size(); i++ )
+ {
+ if ( i == 0 )
+ {
+ s << arr.at ( i );
+ }
+ else
+ {
+ s << delim << arr.at ( i );
+ }
+ }
+ return s.str();
// return "";
-}
-double terAlignment::score()
-{
- if ( ( numWords <= 0.0 ) && ( numEdits > 0.0 ) ) {
- return 1.0;
- }
- if ( numWords <= 0.0 ) {
- return 0.0;
- }
- return ( double ) numEdits / numWords;
-}
-double terAlignment::scoreAv()
-{
- if ( ( averageWords <= 0.0 ) && ( numEdits > 0.0 ) ) {
- return 1.0;
- }
- if ( averageWords <= 0.0 ) {
- return 0.0;
- }
- return ( double ) numEdits / averageWords;
-}
-
-void terAlignment::scoreDetails()
-{
- numIns = numDel = numSub = numWsf = numSft = 0;
- if((int)allshifts.size()>0) {
- for(int i = 0; i < (int)allshifts.size(); ++i) {
- numWsf += allshifts[i].size();
}
- numSft = allshifts.size();
- }
-
- if((int)alignment.size()>0 ) {
- for(int i = 0; i < (int)alignment.size(); ++i) {
- switch (alignment[i]) {
- case 'S':
- case 'T':
- numSub++;
- break;
- case 'D':
- numDel++;
- break;
- case 'I':
- numIns++;
- break;
- }
+ double terAlignment::score()
+ {
+ if ( ( numWords <= 0.0 ) && ( numEdits > 0.0 ) )
+ {
+ return 1.0;
+ }
+ if ( numWords <= 0.0 )
+ {
+ return 0.0;
+ }
+ return ( double ) numEdits / numWords;
+ }
+ double terAlignment::scoreAv()
+ {
+ if ( ( averageWords <= 0.0 ) && ( numEdits > 0.0 ) )
+ {
+ return 1.0;
+ }
+ if ( averageWords <= 0.0 )
+ {
+ return 0.0;
+ }
+ return ( double ) numEdits / averageWords;
}
+
+ void terAlignment::scoreDetails()
+ {
+ numIns = numDel = numSub = numWsf = numSft = 0;
+ if((int)allshifts.size()>0)
+ {
+ for(int i = 0; i < (int)allshifts.size(); ++i)
+ {
+ numWsf += allshifts[i].size();
+ }
+ numSft = allshifts.size();
+ }
+
+ if((int)alignment.size()>0 )
+ {
+ for(int i = 0; i < (int)alignment.size(); ++i)
+ {
+ switch (alignment[i])
+ {
+ case 'S':
+ case 'T':
+ numSub++;
+ break;
+ case 'D':
+ numDel++;
+ break;
+ case 'I':
+ numIns++;
+ break;
+ }
+ }
+ }
+ // if(numEdits != numSft + numDel + numIns + numSub)
+ // System.out.println("** Error, unmatch edit erros " + numEdits +
+ // " vs " + (numSft + numDel + numIns + numSub));
+ }
+ string terAlignment::printAlignments()
+ {
+ stringstream to_return;
+ for(int i = 0; i < (int)alignment.size(); ++i)
+ {
+ char alignInfo=alignment.at(i);
+ if (alignInfo == 'A' )
+ {
+ alignInfo='A';
+ }
+
+ if (i==0)
+ {
+ to_return << alignInfo;
+ }
+ else
+ {
+ to_return << " " << alignInfo;
+ }
+ }
+ return to_return.str();
}
- // if(numEdits != numSft + numDel + numIns + numSub)
- // System.out.println("** Error, unmatch edit erros " + numEdits +
- // " vs " + (numSft + numDel + numIns + numSub));
+string terAlignment::printAllShifts()
+{
+ stringstream to_return;
+ if ( ( int ) allshifts.size() == 0 )
+ {
+// s += "\nNumShifts: 0";
+ to_return << "NbrShifts: 0";
+ }
+ else
+ {
+// s += "\nNumShifts: " + (int)allshifts.size();
+ to_return << "NbrShifts: "<< ( int ) allshifts.size();
+ for ( int i = 0; i < ( int ) allshifts.size(); i++ )
+ {
+ to_return << "\t" ;
+ to_return << ( ( terShift ) allshifts[i] ).toString();
+// s += "\n " + allshifts[i];
+ }
+ }
+ return to_return.str();
}
} \ No newline at end of file
diff --git a/mert/TER/terAlignment.h b/mert/TER/terAlignment.h
index c8c82eac8..0af86f663 100644
--- a/mert/TER/terAlignment.h
+++ b/mert/TER/terAlignment.h
@@ -1,5 +1,26 @@
-#ifndef MERT_TER_TER_ALIGNMENT_H_
-#define MERT_TER_TER_ALIGNMENT_H_
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
+#ifndef MERT_TER_TERALIGNMENT_H_
+#define MERT_TER_TERALIGNMENT_H_
+
#include <vector>
#include <stdio.h>
@@ -13,39 +34,41 @@ using namespace std;
namespace TERCpp
{
-class terAlignment
-{
-private:
-public:
-
- terAlignment();
- string toString();
- void scoreDetails();
+ class terAlignment
+ {
+ private:
+ public:
- vector<string> ref;
- vector<string> hyp;
- vector<string> aftershift;
+ terAlignment();
+ string toString();
+ void scoreDetails();
- vector<terShift> allshifts;
+ vector<string> ref;
+ vector<string> hyp;
+ vector<string> aftershift;
+ vector<terShift> allshifts;
+ vector<int> hyp_int;
+ vector<int> aftershift_int;
- double numEdits;
- double numWords;
- double averageWords;
- vector<char> alignment;
- string bestRef;
+ double numEdits;
+ double numWords;
+ double averageWords;
+ vector<char> alignment;
+ string bestRef;
- int numIns;
- int numDel;
- int numSub;
- int numSft;
- int numWsf;
+ int numIns;
+ int numDel;
+ int numSub;
+ int numSft;
+ int numWsf;
- string join ( string delim, vector<string> arr );
- double score();
- double scoreAv();
-};
+ string join ( string delim, vector<string> arr );
+ double score();
+ double scoreAv();
+ string printAlignments();
+ string printAllShifts();
+ };
}
-
-#endif // MERT_TER_TER_ALIGNMENT_H__
+#endif \ No newline at end of file
diff --git a/mert/TER/terShift.cpp b/mert/TER/terShift.cpp
index 428803849..c1106db76 100644
--- a/mert/TER/terShift.cpp
+++ b/mert/TER/terShift.cpp
@@ -1,3 +1,23 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "terShift.h"
using namespace std;
@@ -22,32 +42,32 @@ namespace TERCpp
// numSft=0;
// numWsf=0;
// }
-terShift::terShift ()
-{
- start = 0;
- end = 0;
- moveto = 0;
- newloc = 0;
- cost=1.0;
-}
-terShift::terShift ( int _start, int _end, int _moveto, int _newloc )
-{
- start = _start;
- end = _end;
- moveto = _moveto;
- newloc = _newloc;
- cost=1.0;
-}
+ terShift::terShift ()
+ {
+ start = 0;
+ end = 0;
+ moveto = 0;
+ newloc = 0;
+ cost=1.0;
+ }
+ terShift::terShift ( int _start, int _end, int _moveto, int _newloc )
+ {
+ start = _start;
+ end = _end;
+ moveto = _moveto;
+ newloc = _newloc;
+ cost=1.0;
+ }
-terShift::terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted )
-{
- start = _start;
- end = _end;
- moveto = _moveto;
- newloc = _newloc;
- shifted = _shifted;
- cost=1.0;
-}
+ terShift::terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted )
+ {
+ start = _start;
+ end = _end;
+ moveto = _moveto;
+ newloc = _newloc;
+ shifted = _shifted;
+ cost=1.0;
+ }
// string terShift::vectorToString(vector<string> vec)
// {
// string retour("");
@@ -58,38 +78,44 @@ terShift::terShift ( int _start, int _end, int _moveto, int _newloc, vector<stri
// return retour;
// }
-string terShift::toString()
-{
- stringstream s;
- s.str ( "" );
- s << "[" << start << ", " << end << ", " << moveto << "/" << newloc << "]";
- if ( ( int ) shifted.size() > 0 ) {
- s << " (" << vectorToString ( shifted ) << ")";
- }
- return s.str();
-}
+ string terShift::toString()
+ {
+ stringstream s;
+ s.str ( "" );
+ s << "[" << start << ", " << end << ", " << moveto << "/" << newloc << "]";
+ if ( ( int ) shifted.size() > 0 )
+ {
+ s << " (" << vectorToString ( shifted ) << ")";
+ }
+ return s.str();
+ }
-/* The distance of the shift. */
-int terShift::distance()
-{
- if ( moveto < start ) {
- return start - moveto;
- } else if ( moveto > end ) {
- return moveto - end;
- } else {
- return moveto - start;
- }
-}
+ /* The distance of the shift. */
+ int terShift::distance()
+ {
+ if ( moveto < start )
+ {
+ return start - moveto;
+ }
+ else if ( moveto > end )
+ {
+ return moveto - end;
+ }
+ else
+ {
+ return moveto - start;
+ }
+ }
-bool terShift::leftShift()
-{
- return ( moveto < start );
-}
+ bool terShift::leftShift()
+ {
+ return ( moveto < start );
+ }
-int terShift::size()
-{
- return ( end - start ) + 1;
-}
+ int terShift::size()
+ {
+ return ( end - start ) + 1;
+ }
// terShift terShift::operator=(terShift t)
// {
//
diff --git a/mert/TER/terShift.h b/mert/TER/terShift.h
index 679a7c8bb..ba84a5947 100644
--- a/mert/TER/terShift.h
+++ b/mert/TER/terShift.h
@@ -1,5 +1,26 @@
-#ifndef MERT_TER_TER_SHIFT_H_
-#define MERT_TER_TER_SHIFT_H_
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
+#ifndef MERT_TER_TERSHIFT_H_
+#define MERT_TER_TERSHIFT_H_
+
#include <vector>
#include <stdio.h>
@@ -7,38 +28,38 @@
#include <sstream>
#include "tools.h"
+
using namespace std;
using namespace Tools;
namespace TERCpp
{
-class terShift
-{
-private:
-public:
-
- terShift();
- terShift ( int _start, int _end, int _moveto, int _newloc );
- terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted );
- string toString();
- int distance() ;
- bool leftShift();
- int size();
+ class terShift
+ {
+ private:
+ public:
+
+ terShift();
+ terShift ( int _start, int _end, int _moveto, int _newloc );
+ terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted );
+ string toString();
+ int distance() ;
+ bool leftShift();
+ int size();
// terShift operator=(terShift t);
// string vectorToString(vector<string> vec);
- int start;
- int end;
- int moveto;
- int newloc;
- vector<string> shifted; // The words we shifted
- vector<char> alignment ; // for pra_more output
- vector<string> aftershift; // for pra_more output
- // This is used to store the cost of a shift, so we don't have to
- // calculate it multiple times.
- double cost;
-};
+ int start;
+ int end;
+ int moveto;
+ int newloc;
+ vector<string> shifted; // The words we shifted
+ vector<char> alignment ; // for pra_more output
+ vector<string> aftershift; // for pra_more output
+ // This is used to store the cost of a shift, so we don't have to
+ // calculate it multiple times.
+ double cost;
+ };
}
-
-#endif // MERT_TER_TER_SHIFT_H_
+#endif \ No newline at end of file
diff --git a/mert/TER/tercalc.cpp b/mert/TER/tercalc.cpp
index e16f692e8..b7f63772c 100644
--- a/mert/TER/tercalc.cpp
+++ b/mert/TER/tercalc.cpp
@@ -1,3 +1,23 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
//
// C++ Implementation: tercalc
//
@@ -15,1021 +35,902 @@ using namespace Tools;
namespace TERCpp
{
-terCalc::terCalc()
-{
- MAX_SHIFT_SIZE = 50;
- INF = 999999.0;
- shift_cost = 1.0;
- insert_cost = 1.0;
- delete_cost = 1.0;
- substitute_cost = 1.0;
- match_cost = 0.0;
- NUM_SEGMENTS_SCORED = 0;
- NUM_SHIFTS_CONSIDERED = 0;
- NUM_BEAM_SEARCH_CALLS = 0;
- BEAM_WIDTH = 20;
- MAX_SHIFT_DIST = 50;
- PRINT_DEBUG = false;
-}
-
-
-// terCalc::~terCalc()
-// {
-// }
-// size_t* terCalc::hashVec ( vector<string> s )
-// {
-// size_t retour[ ( int ) s.size() ];
-// int i=0;
-// for ( i=0; i< ( int ) s.size(); i++ )
-// {
-// boost::hash<std::string> hasher;
-// retour[i]=hasher ( s.at ( i ) );
-// }
-// return retour;
-// }
-
-
-int terCalc::WERCalculation ( size_t * ref, size_t * hyp )
-{
- int retour;
- int REFSize = sizeof ( ref ) + 1;
- int HYPSize = sizeof ( hyp ) + 1;
- int WER[REFSize][HYPSize];
- int i = 0;
- int j = 0;
- for ( i = 0; i < REFSize; i++ ) {
- WER[i][0] = ( int ) i;
- }
- for ( j = 0; j < HYPSize; j++ ) {
- WER[0][j] = ( int ) j;
- }
- for ( j = 1; j < HYPSize; j++ ) {
- for ( i = 1; i < REFSize; i++ ) {
- if ( i == 1 ) {
- cerr << endl;
- }
- if ( ref[i-1] == hyp[j-1] ) {
- WER[i][j] = WER[i-1][j-1];
- cerr << "- ";
- cerr << WER[i][j] << "-\t";
- } else {
- if ( ( ( WER[i-1][ j] + 1 ) < ( WER[i][j-1] + 1 ) ) && ( ( WER[i-1][j] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
- WER[i][j] = ( WER[i-1][j] + 1 );
-// cerr << "D ";
- cerr << WER[i][j] << "D\t";
- } else {
- if ( ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j] + 1 ) ) && ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
- WER[i][j] = ( WER[i][j-1] + 1 );
-// cerr << "I ";
- cerr << WER[i][j] << "I\t";
- } else {
- WER[i][j] = ( WER[i-1][j-1] + 1 );
-// cerr << "S ";
- cerr << WER[i][j] << "S\t";
- }
- }
- }
- }
- }
- cerr << endl;
- retour = WER[i-1][j-1];
- cerr << "i : " << i - 1 << "\tj : " << j - 1 << endl;
- return retour;
-}
-int terCalc::WERCalculation ( std::vector< int > ref, std::vector< int > hyp )
-{
- stringstream s;
- s.str ( "" );
- string stringRef ( "" );
- string stringHyp ( "" );
- for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) {
- if ( l_it == ref.begin() ) {
- s << ( *l_it );
- } else {
- s << " " << ( *l_it );
- }
- }
- stringRef = s.str();
- s.str ( "" );
- for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) {
- if ( l_itHyp == hyp.begin() ) {
- s << ( *l_itHyp );
- } else {
- s << " " << ( *l_itHyp );
- }
- }
- stringHyp = s.str();
- s.str ( "" );
- return WERCalculation ( stringToVector ( stringRef, " " ), stringToVector ( stringHyp , " " ) );
-}
-
-terAlignment terCalc::TER ( std::vector< int > hyp, std::vector< int > ref )
-{
- stringstream s;
- s.str ( "" );
- string stringRef ( "" );
- string stringHyp ( "" );
- for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) {
- if ( l_it == ref.begin() ) {
- s << ( *l_it );
- } else {
- s << " " << ( *l_it );
- }
- }
- stringRef = s.str();
- s.str ( "" );
- for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) {
- if ( l_itHyp == hyp.begin() ) {
- s << ( *l_itHyp );
- } else {
- s << " " << ( *l_itHyp );
+ terCalc::terCalc()
+ {
+ TAILLE_PERMUT_MAX = 50;
+ infinite = 999999.0;
+ shift_cost = 1.0;
+ insert_cost = 1.0;
+ delete_cost = 1.0;
+ substitute_cost = 1.0;
+ match_cost = 0.0;
+ NBR_SEGS_EVALUATED = 0;
+ NBR_PERMUTS_CONSID = 0;
+ NBR_BS_APPELS = 0;
+ TAILLE_BEAM = 20;
+ DIST_MAX_PERMUT = 50;
+ PRINT_DEBUG = false;
+ hypSpans.clear();
+ refSpans.clear();
+ }
+
+
+ terAlignment terCalc::WERCalculation ( vector< string > hyp , vector< string > ref )
+ {
+
+ return minimizeDistanceEdition ( hyp, ref, hypSpans );
+
}
- }
- stringHyp = s.str();
- s.str ( "" );
- return TER ( stringToVector ( stringRef , " " ), stringToVector ( stringHyp , " " ) );
-}
-int terCalc::WERCalculation ( vector<string> ref, vector<string> hyp )
-{
- int retour;
- int REFSize = ( int ) ref.size() + 1;
- int HYPSize = ( int ) hyp.size() + 1;
- int WER[REFSize][HYPSize];
- char WERchar[REFSize][HYPSize];
- int i = 0;
- int j = 0;
- for ( i = 0; i < REFSize; i++ ) {
- WER[i][0] = ( int ) i;
- }
- for ( j = 0; j < HYPSize; j++ ) {
- WER[0][j] = ( int ) j;
- }
- for ( j = 1; j < HYPSize; j++ ) {
- for ( i = 1; i < REFSize; i++ ) {
-// if (i==1)
-// {
-// cerr << endl;
-// }
- if ( ref[i-1] == hyp[j-1] ) {
- WER[i][j] = WER[i-1][j-1];
-// cerr << "- ";
-// cerr << WER[i][j]<< "-\t";
- WERchar[i][j] = '-';
- } else {
- if ( ( ( WER[i-1][ j] + 1 ) < ( WER[i][j-1] + 1 ) ) && ( ( WER[i-1][j] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
- WER[i][j] = ( WER[i-1][j] + 1 );
-// cerr << "D ";
-// cerr << WER[i][j]<< "D\t";
- WERchar[i][j] = 'D';
- } else {
- if ( ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j] + 1 ) ) && ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
- WER[i][j] = ( WER[i][j-1] + 1 );
-// cerr << "I ";
-// cerr << WER[i][j]<< "I\t";
- WERchar[i][j] = 'I';
- } else {
- WER[i][j] = ( WER[i-1][j-1] + 1 );
-// cerr << "S ";
-// cerr << WER[i][j]<< "S\t";
- WERchar[i][j] = 'S';
- }
- }
- }
- }
- }
- cerr << endl;
- retour = WER[REFSize-1][HYPSize-1];
- cerr << "i : " << i - 1 << "\tj : " << j - 1 << endl;
- j = HYPSize - 1;
- i = REFSize - 1;
- int k;
- stringstream s;
-// WERalignment local[HYPSize];
- if ( HYPSize > REFSize ) {
- k = HYPSize;
- } else {
- k = REFSize;
- }
- WERalignment local;
- while ( j > 0 && i > 0 ) {
- cerr << "indice i : " << i << "\t";
- cerr << "indice j : " << j << endl;
- if ( ( j == HYPSize - 1 ) && ( i == REFSize - 1 ) ) {
- alignmentElement localInfos;
- s << WER[i][j];
- localInfos.push_back ( s.str() );
- s.str ( "" );
- s << WERchar[i][j];
- localInfos.push_back ( s.str() );
- s.str ( "" );
- local.push_back ( localInfos );
-// // i--;
-// j--;
- }
-// else
+ terAlignment terCalc::TER ( std::vector< int > hyp, std::vector< int > ref )
{
- if ( ( ( WER[i-1][j-1] ) <= ( WER[i-1][j] ) ) && ( ( WER[i-1][j-1] ) <= ( WER[i][j-1] ) ) ) {
- alignmentElement localInfos;
- s << WER[i-1][j-1];
- localInfos.push_back ( s.str() );
+ stringstream s;
s.str ( "" );
- s << WERchar[i-1][j-1];
- localInfos.push_back ( s.str() );
+ string stringRef ( "" );
+ string stringHyp ( "" );
+ for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ )
+ {
+ if ( l_it == ref.begin() )
+ {
+ s << ( *l_it );
+ }
+ else
+ {
+ s << " " << ( *l_it );
+ }
+ }
+ stringRef = s.str();
s.str ( "" );
- local.push_back ( localInfos );
- i--;
- j--;
- } else {
- if ( ( ( WER[i][j-1] ) <= ( WER[i-1][j] ) ) && ( ( WER[i][j-1] ) <= ( WER[i-1][j-1] ) ) ) {
- alignmentElement localInfos;
- s << WER[i][j-1];
- localInfos.push_back ( s.str() );
- s.str ( "" );
- s << WERchar[i][j-1];
- localInfos.push_back ( s.str() );
- s.str ( "" );
- local.push_back ( localInfos );
- j--;
- } else {
- alignmentElement localInfos;
- s << WER[i-1][j];
- localInfos.push_back ( s.str() );
- s.str ( "" );
- s << WERchar[i-1][j];
- localInfos.push_back ( s.str() );
- s.str ( "" );
- local.push_back ( localInfos );
- i--;
+ for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ )
+ {
+ if ( l_itHyp == hyp.begin() )
+ {
+ s << ( *l_itHyp );
+ }
+ else
+ {
+ s << " " << ( *l_itHyp );
+ }
}
- }
- }
- }
-
- for ( j = 1; j < HYPSize; j++ ) {
- for ( i = 1; i < REFSize; i++ ) {
- cerr << WERchar[i][j] << " ";
- }
- cerr << endl;
- }
- cerr << endl;
- for ( j = 1; j < HYPSize; j++ ) {
- for ( i = 1; i < REFSize; i++ ) {
- cerr << WER[i][j] << " ";
+ stringHyp = s.str();
+ s.str ( "" );
+ return TER ( stringToVector ( stringRef , " " ), stringToVector ( stringHyp , " " ) );
}
- cerr << endl;
- }
- cerr << "=================" << endl;
-// k=local.size()-1;
-// while (k>0)
-// {
-// alignmentElement localInfos;
-// localInfos=local.at(k-1);
-// l_WERalignment.push_back(localInfos);
-// cerr << (string)localInfos.at(1)+"\t";
- k--;
-// }
-// cerr<<endl;
- k = local.size() - 1;
- int l = 0;
- int m = 0;
- while ( k > 0 ) {
- alignmentElement localInfos;
- localInfos = local.at ( k - 1 );
- if ( ( int ) ( localInfos.at ( 1 ).compare ( "D" ) ) == 0 || l > HYPSize - 1 ) {
- localInfos.push_back ( "***" );
- } else {
- localInfos.push_back ( hyp.at ( l ) );
- l++;
- }
- if ( ( int ) ( localInfos.at ( 1 ).compare ( "I" ) ) == 0 || m > REFSize - 1 ) {
- localInfos.push_back ( "***" );
- } else {
- localInfos.push_back ( ref.at ( m ) );
- m++;
+ hashMapInfos terCalc::createConcordMots ( vector<string> hyp, vector<string> ref )
+ {
+ hashMap tempHash;
+ hashMapInfos retour;
+ for ( int i = 0; i < ( int ) hyp.size(); i++ )
+ {
+ tempHash.addHasher ( hyp.at ( i ), "" );
+ }
+ bool cor[ref.size() ];
+ for ( int i = 0; i < ( int ) ref.size(); i++ )
+ {
+ if ( tempHash.trouve ( ( string ) ref.at ( i ) ) )
+ {
+ cor[i] = true;
+ }
+ else
+ {
+ cor[i] = false;
+ }
+ }
+ for ( int start = 0; start < ( int ) ref.size(); start++ )
+ {
+ if ( cor[start] )
+ {
+ for ( int end = start; ( ( end < ( int ) ref.size() ) && ( end - start <= TAILLE_PERMUT_MAX ) && ( cor[end] ) );end++ )
+ {
+ vector<string> ajouter = subVector ( ref, start, end + 1 );
+ string ajouterString = vectorToString ( ajouter );
+ vector<int> values = retour.getValue ( ajouterString );
+ values.push_back ( start );
+ if ( values.size() > 1 )
+ {
+ retour.setValue ( ajouterString, values );
+ }
+ else
+ {
+ retour.addValue ( ajouterString, values );
+ }
+ }
+ }
+ }
+ return retour;
}
-// cerr << vectorToString(localInfos)<<endl;
-// cerr <<localInfos.at(0)<<"\t"<<localInfos.at(1)<<"\t"<<localInfos.at(2)<<"\t"<<localInfos.at(3)<<endl;
- l_WERalignment.push_back ( localInfos );
-// cerr << (string)localInfos.at(1)+"\t";
- k--;
- }
- cerr << endl;
- /* k=local.size()-1;
- while (k>0)
- {
- alignmentElement localInfos;
- localInfos=local.at(k-1);
- // l_WERalignment.push_back(localInfos);
- cerr << (string)localInfos.at(0)+"\t";
- k--;
- }
- cerr<<endl;*/
- k = 0;
-// k=l_WERalignment.size()-1;
- m = 0;
- while ( k < ( int ) l_WERalignment.size() ) {
- alignmentElement localInfos;
- localInfos = l_WERalignment.at ( k );
- cerr << localInfos.at ( 0 ) << "\t" << localInfos.at ( 1 ) << "\t" << localInfos.at ( 2 ) << "\t" << localInfos.at ( 3 ) << endl;
- /* if ((int)(localInfos.at(1).compare("I"))==0)
- {
- cerr << "***\t";
- }
- else
- {
- // if (m<ref.size())
- {
- cerr << ref.at(m) << "\t";
- }
- m++;
- }
- */
- k++;
- }
- cerr << endl;
- /* k=local.size()-1;
- l=0;
- while (k>0)
- {
- alignmentElement localInfos;
- localInfos=local.at(k-1);
- // l_WERalignment.push_back(localInfos);
- if ((int)(localInfos.at(1).compare("D"))==0)
- {
- cerr << "***\t";
- }
- else
- {
- cerr << hyp.at(l) << "\t";
- l++;
- }
- k--;
- }
- cerr<<endl;*/
- cerr << "=================" << endl;
- return retour;
-}
-
-// string terCalc::vectorToString(vector<string> vec)
-// {
-// string retour("");
-// for (vector<string>::iterator vecIter=vec.begin();vecIter!=vec.end(); vecIter++)
-// {
-// retour+=(*vecIter)+"\t";
-// }
-// return retour;
-// }
-// vector<string> terCalc::subVector(vector<string> vec, int start, int end)
-// {
-// if (start>end)
-// {
-// cerr << "ERREUR : terCalc::subVector : end > start"<<endl;
-// exit(0);
-// }
-// vector<string> retour;
-// for (int i=start; ((i<end) && (i< vec.size())); i++)
-// {
-// retour.push_back(vec.at(i));
-// }
-// return retour;
-// }
-
-hashMapInfos terCalc::BuildWordMatches ( vector<string> hyp, vector<string> ref )
-{
- hashMap tempHash;
- hashMapInfos retour;
- for ( int i = 0; i < ( int ) hyp.size(); i++ ) {
- tempHash.addHasher ( hyp.at ( i ), "" );
- }
- bool cor[ref.size() ];
- for ( int i = 0; i < ( int ) ref.size(); i++ ) {
- if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) {
- cor[i] = true;
- } else {
- cor[i] = false;
- }
- }
- for ( int start = 0; start < ( int ) ref.size(); start++ ) {
- if ( cor[start] ) {
- for ( int end = start; ( ( end < ( int ) ref.size() ) && ( end - start <= MAX_SHIFT_SIZE ) && ( cor[end] ) ); end++ ) {
- vector<string> ajouter = subVector ( ref, start, end + 1 );
- string ajouterString = vectorToString ( ajouter );
- vector<int> values = retour.getValue ( ajouterString );
- values.push_back ( start );
- if ( values.size() > 1 ) {
- retour.setValue ( ajouterString, values );
- } else {
- retour.addValue ( ajouterString, values );
+ bool terCalc::trouverIntersection ( vecInt refSpan, vecInt hypSpan )
+ {
+ if ( ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) ) )
+ {
+ return true;
}
- }
+ return false;
}
- }
- return retour;
-}
-
-bool terCalc::spanIntersection ( vecInt refSpan, vecInt hypSpan )
-{
- if ( ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) ) ) {
- return true;
- }
- return false;
-}
-terAlignment terCalc::MinEditDist ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans )
-{
- double current_best = INF;
- double last_best = INF;
- int first_good = 0;
- int current_first_good = 0;
- int last_good = -1;
- int cur_last_good = 0;
- int last_peak = 0;
- int cur_last_peak = 0;
- int i, j;
- double cost, icost, dcost;
- double score;
-
-// int hwsize = hyp.size()-1;
-// int rwsize = ref.size()-1;
- NUM_BEAM_SEARCH_CALLS++;
-// if ((ref.size()+1 > sizeof(S)) || (hyp.size()+1 > sizeof(S)))
-// {
-// int max = ref.size();
-// if (hyp.size() > ref.size()) max = hyp.size();
-// max += 26; // we only need a +1 here, but let's pad for future use
-// S = new double[max][max];
-// P = new char[max][max];
-// }
- for ( i = 0; i <= ( int ) ref.size(); i++ ) {
- for ( j = 0; j <= ( int ) hyp.size(); j++ ) {
- S[i][j] = -1.0;
- P[i][j] = '0';
- }
- }
- S[0][0] = 0.0;
- for ( j = 0; j <= ( int ) hyp.size(); j++ ) {
- last_best = current_best;
- current_best = INF;
- first_good = current_first_good;
- current_first_good = -1;
- last_good = cur_last_good;
- cur_last_good = -1;
- last_peak = cur_last_peak;
- cur_last_peak = 0;
- for ( i = first_good; i <= ( int ) ref.size(); i++ ) {
- if ( i > last_good ) {
- break;
- }
- if ( S[i][j] < 0 ) {
- continue;
- }
- score = S[i][j];
- if ( ( j < ( int ) hyp.size() ) && ( score > last_best + BEAM_WIDTH ) ) {
- continue;
- }
- if ( current_first_good == -1 ) {
- current_first_good = i ;
- }
- if ( ( i < ( int ) ref.size() ) && ( j < ( int ) hyp.size() ) ) {
- if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || spanIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) {
- if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) {
- cost = match_cost + score;
- if ( ( S[i+1][j+1] == -1 ) || ( cost < S[i+1][j+1] ) ) {
- S[i+1][j+1] = cost;
- P[i+1][j+1] = ' ';
- }
- if ( cost < current_best ) {
- current_best = cost;
+ terAlignment terCalc::minimizeDistanceEdition ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans )
+ {
+ double current_best = infinite;
+ double last_best = infinite;
+ int first_good = 0;
+ int current_first_good = 0;
+ int last_good = -1;
+ int cur_last_good = 0;
+ int last_peak = 0;
+ int cur_last_peak = 0;
+ int i, j;
+ double cost, icost, dcost;
+ double score;
+
+
+
+ NBR_BS_APPELS++;
+
+
+ for ( i = 0; i <= ( int ) ref.size(); i++ )
+ {
+ for ( j = 0; j <= ( int ) hyp.size(); j++ )
+ {
+ S[i][j] = -1.0;
+ P[i][j] = '0';
}
- if ( current_best == cost ) {
- cur_last_peak = i + 1;
+ }
+ S[0][0] = 0.0;
+ for ( j = 0; j <= ( int ) hyp.size(); j++ )
+ {
+ last_best = current_best;
+ current_best = infinite;
+ first_good = current_first_good;
+ current_first_good = -1;
+ last_good = cur_last_good;
+ cur_last_good = -1;
+ last_peak = cur_last_peak;
+ cur_last_peak = 0;
+ for ( i = first_good; i <= ( int ) ref.size(); i++ )
+ {
+ if ( i > last_good )
+ {
+ break;
+ }
+ if ( S[i][j] < 0 )
+ {
+ continue;
+ }
+ score = S[i][j];
+ if ( ( j < ( int ) hyp.size() ) && ( score > last_best + TAILLE_BEAM ) )
+ {
+ continue;
+ }
+ if ( current_first_good == -1 )
+ {
+ current_first_good = i ;
+ }
+ if ( ( i < ( int ) ref.size() ) && ( j < ( int ) hyp.size() ) )
+ {
+ if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) )
+ {
+ if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 )
+ {
+ cost = match_cost + score;
+ if ( ( S[i+1][j+1] == -1 ) || ( cost < S[i+1][j+1] ) )
+ {
+ S[i+1][j+1] = cost;
+ P[i+1][j+1] = 'A';
+ }
+ if ( cost < current_best )
+ {
+ current_best = cost;
+ }
+ if ( current_best == cost )
+ {
+ cur_last_peak = i + 1;
+ }
+ }
+ else
+ {
+ cost = substitute_cost + score;
+ if ( ( S[i+1][j+1] < 0 ) || ( cost < S[i+1][j+1] ) )
+ {
+ S[i+1][j+1] = cost;
+ P[i+1][j+1] = 'S';
+ if ( cost < current_best )
+ {
+ current_best = cost;
+ }
+ if ( current_best == cost )
+ {
+ cur_last_peak = i + 1 ;
+ }
+ }
+ }
+ }
+ }
+ cur_last_good = i + 1;
+ if ( j < ( int ) hyp.size() )
+ {
+ icost = score + insert_cost;
+ if ( ( S[i][j+1] < 0 ) || ( S[i][j+1] > icost ) )
+ {
+ S[i][j+1] = icost;
+ P[i][j+1] = 'I';
+ if ( ( cur_last_peak < i ) && ( current_best == icost ) )
+ {
+ cur_last_peak = i;
+ }
+ }
+ }
+ if ( i < ( int ) ref.size() )
+ {
+ dcost = score + delete_cost;
+ if ( ( S[ i+1][ j] < 0.0 ) || ( S[i+1][j] > dcost ) )
+ {
+ S[i+1][j] = dcost;
+ P[i+1][j] = 'D';
+ if ( i >= last_good )
+ {
+ last_good = i + 1 ;
+ }
+ }
+ }
}
- } else {
- cost = substitute_cost + score;
- if ( ( S[i+1][j+1] < 0 ) || ( cost < S[i+1][j+1] ) ) {
- S[i+1][j+1] = cost;
- P[i+1][j+1] = 'S';
- if ( cost < current_best ) {
- current_best = cost;
- }
- if ( current_best == cost ) {
- cur_last_peak = i + 1 ;
- }
+ }
+
+
+ int tracelength = 0;
+ i = ref.size();
+ j = hyp.size();
+ while ( ( i > 0 ) || ( j > 0 ) )
+ {
+ tracelength++;
+ if ( P[i][j] == 'A' )
+ {
+ i--;
+ j--;
}
- }
+ else
+ if ( P[i][j] == 'S' )
+ {
+ i--;
+ j--;
+ }
+ else
+ if ( P[i][j] == 'D' )
+ {
+ i--;
+ }
+ else
+ if ( P[i][j] == 'I' )
+ {
+ j--;
+ }
+ else
+ {
+ cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " << P[i][j] << endl;
+ exit ( -1 );
+ }
}
- }
- cur_last_good = i + 1;
- if ( j < ( int ) hyp.size() ) {
- icost = score + insert_cost;
- if ( ( S[i][j+1] < 0 ) || ( S[i][j+1] > icost ) ) {
- S[i][j+1] = icost;
- P[i][j+1] = 'I';
- if ( ( cur_last_peak < i ) && ( current_best == icost ) ) {
- cur_last_peak = i;
- }
+ vector<char> path ( tracelength );
+ i = ref.size();
+ j = hyp.size();
+ while ( ( i > 0 ) || ( j > 0 ) )
+ {
+ path[--tracelength] = P[i][j];
+ if ( P[i][j] == 'A' )
+ {
+ i--;
+ j--;
+ }
+ else
+ if ( P[i][j] == 'S' )
+ {
+ i--;
+ j--;
+ }
+ else
+ if ( P[i][j] == 'D' )
+ {
+ i--;
+ }
+ else
+ if ( P[i][j] == 'I' )
+ {
+ j--;
+ }
}
- }
- if ( i < ( int ) ref.size() ) {
- dcost = score + delete_cost;
- if ( ( S[ i+1][ j] < 0.0 ) || ( S[i+1][j] > dcost ) ) {
- S[i+1][j] = dcost;
- P[i+1][j] = 'D';
- if ( i >= last_good ) {
- last_good = i + 1 ;
- }
+ terAlignment to_return;
+ to_return.numWords = ref.size();
+ to_return.alignment = path;
+ to_return.numEdits = S[ref.size() ][hyp.size() ];
+ to_return.hyp = hyp;
+ to_return.ref = ref;
+ to_return.averageWords = (int)ref.size();
+ if ( PRINT_DEBUG )
+ {
+ cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl;
}
- }
- }
- }
-
+ return to_return;
- int tracelength = 0;
- i = ref.size();
- j = hyp.size();
- while ( ( i > 0 ) || ( j > 0 ) ) {
- tracelength++;
- if ( P[i][j] == ' ' ) {
- i--;
- j--;
- } else if ( P[i][j] == 'S' ) {
- i--;
- j--;
- } else if ( P[i][j] == 'D' ) {
- i--;
- } else if ( P[i][j] == 'I' ) {
- j--;
- } else {
- cerr << "ERROR : terCalc::MinEditDist : Invalid path : " << P[i][j] << endl;
- exit ( -1 );
}
- }
- vector<char> path ( tracelength );
- i = ref.size();
- j = hyp.size();
- while ( ( i > 0 ) || ( j > 0 ) ) {
- path[--tracelength] = P[i][j];
- if ( P[i][j] == ' ' ) {
- i--;
- j--;
- } else if ( P[i][j] == 'S' ) {
- i--;
- j--;
- } else if ( P[i][j] == 'D' ) {
- i--;
- } else if ( P[i][j] == 'I' ) {
- j--;
- }
- }
- terAlignment to_return;
- to_return.numWords = ref.size();
- to_return.alignment = path;
- to_return.numEdits = S[ref.size() ][hyp.size() ];
- if ( PRINT_DEBUG ) {
- cerr << "BEGIN DEBUG : terCalc::MinEditDist : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl;
- }
- return to_return;
-
-}
-terAlignment terCalc::TER ( vector<string> hyp, vector<string> ref )
-{
- hashMapInfos rloc = BuildWordMatches ( hyp, ref );
- terAlignment cur_align = MinEditDist ( hyp, ref, hypSpans );
- vector<string> cur = hyp;
- cur_align.hyp = hyp;
- cur_align.ref = ref;
- cur_align.aftershift = hyp;
- double edits = 0;
+ terAlignment terCalc::TER ( vector<string> hyp, vector<string> ref )
+ {
+ hashMapInfos rloc = createConcordMots ( hyp, ref );
+ terAlignment cur_align = minimizeDistanceEdition ( hyp, ref, hypSpans );
+ vector<string> cur = hyp;
+ cur_align.hyp = hyp;
+ cur_align.ref = ref;
+ cur_align.aftershift = hyp;
+ double edits = 0;
// int numshifts = 0;
- vector<terShift> allshifts;
+ vector<terShift> allshifts;
// cerr << "Initial Alignment:" << endl << cur_align.toString() <<endl;
- if ( PRINT_DEBUG ) {
- cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl;
- }
- while ( true ) {
- bestShiftStruct returns;
- returns = CalcBestShift ( cur, hyp, ref, rloc, cur_align );
- if ( returns.m_empty ) {
- break;
- }
- terShift bestShift = returns.m_best_shift;
- cur_align = returns.m_best_align;
- edits += bestShift.cost;
- bestShift.alignment = cur_align.alignment;
- bestShift.aftershift = cur_align.aftershift;
- allshifts.push_back ( bestShift );
- cur = cur_align.aftershift;
- }
- terAlignment to_return;
- to_return = cur_align;
- to_return.allshifts = allshifts;
- to_return.numEdits += edits;
- NUM_SEGMENTS_SCORED++;
- return to_return;
-}
-bestShiftStruct terCalc::CalcBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment med_align )
-{
- bestShiftStruct to_return;
- bool anygain = false;
- bool herr[ ( int ) hyp.size() ];
- bool rerr[ ( int ) ref.size() ];
- int ralign[ ( int ) ref.size() ];
- FindAlignErr ( med_align, herr, rerr, ralign );
- vector<vecTerShift> poss_shifts;
- poss_shifts = GatherAllPossShifts ( cur, ref, rloc, med_align, herr, rerr, ralign );
- double curerr = med_align.numEdits;
- if ( PRINT_DEBUG ) {
- cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
- cerr << "Possible Shifts:" << endl;
- for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- ) {
- for ( int j = 0; j < ( int ) ( poss_shifts.at ( i ) ).size(); j++ ) {
- cerr << " [" << i << "] " << ( ( poss_shifts.at ( i ) ).at ( j ) ).toString() << endl;
- }
- }
- cerr << endl;
- cerr << "END DEBUG " << endl;
- }
- double cur_best_shift_cost = 0.0;
- terAlignment cur_best_align = med_align;
- terShift cur_best_shift;
-
-
-
- for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- ) {
- if ( PRINT_DEBUG ) {
- cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
- cerr << "Considering shift of length " << i << " (" << ( poss_shifts.at ( i ) ).size() << ")" << endl;
- cerr << "END DEBUG " << endl;
- }
- /* Consider shifts of length i+1 */
- double curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits );
- double maxfix = ( 2 * ( 1 + i ) );
- if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
- break;
- }
-
- for ( int s = 0; s < ( int ) ( poss_shifts.at ( i ) ).size(); s++ ) {
- curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits );
- if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
- break;
- }
- terShift curshift = ( poss_shifts.at ( i ) ).at ( s );
-
- alignmentStruct shiftReturns = PerformShift ( cur, curshift );
- vector<string> shiftarr = shiftReturns.nwords;
- vector<vecInt> curHypSpans = shiftReturns.aftershift;
-
- terAlignment curalign = MinEditDist ( shiftarr, ref, curHypSpans );
-
- curalign.hyp = hyp;
- curalign.ref = ref;
- curalign.aftershift = shiftarr;
-
- double gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost );
-
- // if (DEBUG) {
- // string testeuh=terAlignment join(" ", shiftarr);
- if ( PRINT_DEBUG ) {
- cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
- cerr << "Gain for " << curshift.toString() << " is " << gain << ". (result: [" << curalign.join ( " ", shiftarr ) << "]" << endl;
- cerr << "" << curalign.toString() << "\n" << endl;
- cerr << "END DEBUG " << endl;
- }
- // }
- //
- if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) {
- anygain = true;
- cur_best_shift = curshift;
- cur_best_shift_cost = curshift.cost;
- cur_best_align = curalign;
- // if (DEBUG)
- if ( PRINT_DEBUG ) {
- cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
- cerr << "Tmp Choosing shift: " << cur_best_shift.toString() << " gives:\n" << cur_best_align.toString() << "\n" << endl;
- cerr << "END DEBUG " << endl;
+ if ( PRINT_DEBUG )
+ {
+ cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl;
}
- }
- }
- }
- if ( anygain ) {
- to_return.m_best_shift = cur_best_shift;
- to_return.m_best_align = cur_best_align;
- to_return.m_empty = false;
- } else {
- to_return.m_empty = true;
- }
- return to_return;
-}
-
-void terCalc::FindAlignErr ( terAlignment align, bool* herr, bool* rerr, int* ralign )
-{
- int hpos = -1;
- int rpos = -1;
- if ( PRINT_DEBUG ) {
-
- cerr << "BEGIN DEBUG : terCalc::FindAlignErr : " << endl << align.toString() << endl;
- cerr << "END DEBUG " << endl;
- }
- for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) {
- char sym = align.alignment[i];
- if ( sym == ' ' ) {
- hpos++;
- rpos++;
- herr[hpos] = false;
- rerr[rpos] = false;
- ralign[rpos] = hpos;
- } else if ( sym == 'S' ) {
- hpos++;
- rpos++;
- herr[hpos] = true;
- rerr[rpos] = true;
- ralign[rpos] = hpos;
- } else if ( sym == 'I' ) {
- hpos++;
- herr[hpos] = true;
- } else if ( sym == 'D' ) {
- rpos++;
- rerr[rpos] = true;
- ralign[rpos] = hpos;
- } else {
- cerr << "ERROR : terCalc::FindAlignErr : Invalid mini align sequence " << sym << " at pos " << i << endl;
- exit ( -1 );
- }
- }
-}
-
-vector<vecTerShift> terCalc::GatherAllPossShifts ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign )
-{
- vector<vecTerShift> to_return;
- // Don't even bother to look if shifts can't be done
- if ( ( MAX_SHIFT_SIZE <= 0 ) || ( MAX_SHIFT_DIST <= 0 ) ) {
-// terShift[][] to_return = new terShift[0][];
- return to_return;
- }
+ while ( true )
+ {
+ bestShiftStruct returns;
+ returns = findBestShift ( cur, hyp, ref, rloc, cur_align );
+ if ( returns.m_empty )
+ {
+ break;
+ }
+ terShift bestShift = returns.m_best_shift;
+ cur_align = returns.m_best_align;
+ edits += bestShift.cost;
+ bestShift.alignment = cur_align.alignment;
+ bestShift.aftershift = cur_align.aftershift;
+ allshifts.push_back ( bestShift );
+ cur = cur_align.aftershift;
+ }
+ terAlignment to_return;
+ to_return = cur_align;
+ to_return.allshifts = allshifts;
+ to_return.numEdits += edits;
+ NBR_SEGS_EVALUATED++;
+ return to_return;
+ }
+ bestShiftStruct terCalc::findBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment med_align )
+ {
+ bestShiftStruct to_return;
+ bool anygain = false;
+ bool herr[ ( int ) hyp.size() ];
+ bool rerr[ ( int ) ref.size() ];
+ int ralign[ ( int ) ref.size() ];
+ calculateTerAlignment ( med_align, herr, rerr, ralign );
+ vector<vecTerShift> poss_shifts;
+
+ if ( PRINT_DEBUG )
+ {
+ cerr << "BEGIN DEBUG : terCalc::findBestShift (after the calculateTerAlignment call) :" << endl;
+ cerr << "indices: ";
+ for (int l_i=0; l_i < ( int ) ref.size() ; l_i++)
+ {
+ cerr << l_i << "\t";
+ }
+ cerr << endl;
+ cerr << "hyp : \t"<<vectorToString(hyp ,"\t") << endl;
+ cerr << "cur : \t"<<vectorToString(cur ,"\t") << endl;
+ cerr << "ref : \t"<<vectorToString(ref ,"\t") << endl;
+ cerr << "herr : "<<vectorToString(herr,"\t",( int ) hyp.size()) << " | " << ( int ) hyp.size() <<endl;
+ cerr << "rerr : "<<vectorToString(rerr,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() <<endl;
+ cerr << "ralign : "<< vectorToString(ralign,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() << endl;
+ cerr << "END DEBUG " << endl;
+ }
+ poss_shifts = calculerPermutations ( cur, ref, rloc, med_align, herr, rerr, ralign );
+ double curerr = med_align.numEdits;
+ if ( PRINT_DEBUG )
+ {
+ cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
+ cerr << "Possible Shifts:" << endl;
+ for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- )
+ {
+ for ( int j = 0; j < ( int ) ( poss_shifts.at ( i ) ).size(); j++ )
+ {
+ cerr << " [" << i << "] " << ( ( poss_shifts.at ( i ) ).at ( j ) ).toString() << endl;
+ }
+ }
+ cerr << endl;
+ cerr << "END DEBUG " << endl;
+ }
+// exit(0);
+ double cur_best_shift_cost = 0.0;
+ terAlignment cur_best_align = med_align;
+ terShift cur_best_shift;
- vector<vecTerShift> allshifts ( MAX_SHIFT_SIZE + 1 );
-// ArrayList[] allshifts = new ArrayList[MAX_SHIFT_SIZE+1];
-// for (int i = 0; i < allshifts.length; i++)
-// {
-// allshifts[i] = new ArrayList();
-// }
-// List hyplist = Arrays.asList(hyp);
- for ( int start = 0; start < ( int ) hyp.size(); start++ ) {
- string subVectorHypString = vectorToString ( subVector ( hyp, start, start + 1 ) );
- if ( ! rloc.trouve ( subVectorHypString ) ) {
- continue;
- }
+ for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- )
+ {
+ if ( PRINT_DEBUG )
+ {
+ cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
+ cerr << "Considering shift of length " << i << " (" << ( poss_shifts.at ( i ) ).size() << ")" << endl;
+ cerr << "END DEBUG " << endl;
+ }
+ /* Consider shifts of length i+1 */
+ double curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits );
+ double maxfix = ( 2 * ( 1 + i ) );
+ if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) )
+ {
+ break;
+ }
- bool ok = false;
- vector<int> mtiVec = rloc.getValue ( subVectorHypString );
- vector<int>::iterator mti = mtiVec.begin();
- while ( mti != mtiVec.end() && ( ! ok ) ) {
- int moveto = ( *mti );
- mti++;
- if ( ( start != ralign[moveto] ) && ( ( ralign[moveto] - start ) <= MAX_SHIFT_DIST ) && ( ( start - ralign[moveto] - 1 ) <= MAX_SHIFT_DIST ) ) {
- ok = true;
- }
- }
- if ( ! ok ) {
- continue;
+ for ( int s = 0; s < ( int ) ( poss_shifts.at ( i ) ).size(); s++ )
+ {
+ curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits );
+ if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) )
+ {
+ break;
+ }
+ terShift curshift = ( poss_shifts.at ( i ) ).at ( s );
+ if ( PRINT_DEBUG )
+ {
+ cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
+ cerr << "cur : "<< join(" ",cur) << endl;
+ cerr << "curshift : "<< curshift.toString() << endl;
+
+ }
+ alignmentStruct shiftReturns = permuter ( cur, curshift );
+ vector<string> shiftarr = shiftReturns.nwords;
+ vector<vecInt> curHypSpans = shiftReturns.aftershift;
+
+ if ( PRINT_DEBUG )
+ {
+ cerr << "shiftarr : "<< join(" ",shiftarr) << endl;
+// cerr << "curHypSpans : "<< curHypSpans.toString() << endl;
+ cerr << "END DEBUG " << endl;
+ }
+ terAlignment curalign = minimizeDistanceEdition ( shiftarr, ref, curHypSpans );
+
+ curalign.hyp = hyp;
+ curalign.ref = ref;
+ curalign.aftershift = shiftarr;
+
+
+ double gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost );
+
+ // if (DEBUG) {
+ // string testeuh=terAlignment join(" ", shiftarr);
+ if ( PRINT_DEBUG )
+ {
+ cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
+ cerr << "Gain for " << curshift.toString() << " is " << gain << ". (result: [" << curalign.join ( " ", shiftarr ) << "]" << endl;
+ cerr << "Details of gains : gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost )"<<endl;
+ cerr << "Details of gains : gain = ("<<cur_best_align.numEdits << "+" << cur_best_shift_cost << ") - (" << curalign.numEdits << "+" << curshift.cost << ")"<<endl;
+ cerr << "" << curalign.toString() << "\n" << endl;
+ cerr << "END DEBUG " << endl;
+ }
+ // }
+ //
+ if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) )
+ {
+ anygain = true;
+ cur_best_shift = curshift;
+ cur_best_shift_cost = curshift.cost;
+ cur_best_align = curalign;
+ // if (DEBUG)
+ if ( PRINT_DEBUG )
+ {
+ cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
+ cerr << "Tmp Choosing shift: " << cur_best_shift.toString() << " gives:\n" << cur_best_align.toString() << "\n" << endl;
+ cerr << "END DEBUG " << endl;
+ }
+ }
+ }
+ }
+ if ( anygain )
+ {
+ to_return.m_best_shift = cur_best_shift;
+ to_return.m_best_align = cur_best_align;
+ to_return.m_empty = false;
+ }
+ else
+ {
+ to_return.m_empty = true;
+ }
+ return to_return;
}
- ok = true;
- for ( int end = start; ( ok && ( end < ( int ) hyp.size() ) && ( end < start + MAX_SHIFT_SIZE ) ); end++ ) {
- /* check if cand is good if so, add it */
- vector<string> cand = subVector ( hyp, start, end + 1 );
- ok = false;
- if ( ! ( rloc.trouve ( vectorToString ( cand ) ) ) ) {
- continue;
- }
- bool any_herr = false;
+ void terCalc::calculateTerAlignment ( terAlignment align, bool* herr, bool* rerr, int* ralign )
+ {
+ int hpos = -1;
+ int rpos = -1;
+ if ( PRINT_DEBUG )
+ {
- for ( int i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) {
- if ( herr[start+i] ) {
- any_herr = true;
+ cerr << "BEGIN DEBUG : terCalc::calculateTerAlignment : " << endl << align.toString() << endl;
+ cerr << "END DEBUG " << endl;
}
- }
- if ( any_herr == false ) {
- ok = true;
- continue;
- }
-
- vector<int> movetoitVec;
- movetoitVec = rloc.getValue ( ( string ) vectorToString ( cand ) );
- vector<int>::iterator movetoit = movetoitVec.begin();
- while ( movetoit != movetoitVec.end() ) {
- int moveto = ( *movetoit );
- movetoit++;
- if ( ! ( ( ralign[moveto] != start ) && ( ( ralign[moveto] < start ) || ( ralign[moveto] > end ) ) && ( ( ralign[moveto] - start ) <= MAX_SHIFT_DIST ) && ( ( start - ralign[moveto] ) <= MAX_SHIFT_DIST ) ) ) {
- continue;
+ for ( int i = 0; i < ( int ) align.alignment.size(); i++ )
+ {
+ herr[i] = false;
+ rerr[i] = false;
+ ralign[i] = -1;
+ }
+ for ( int i = 0; i < ( int ) align.alignment.size(); i++ )
+ {
+ char sym = align.alignment[i];
+ if ( sym == 'A' )
+ {
+ hpos++;
+ rpos++;
+ herr[hpos] = false;
+ rerr[rpos] = false;
+ ralign[rpos] = hpos;
+ }
+ else
+ if ( sym == 'S' )
+ {
+ hpos++;
+ rpos++;
+ herr[hpos] = true;
+ rerr[rpos] = true;
+ ralign[rpos] = hpos;
+ }
+ else
+ if ( sym == 'I' )
+ {
+ hpos++;
+ herr[hpos] = true;
+ }
+ else
+ if ( sym == 'D' )
+ {
+ rpos++;
+ rerr[rpos] = true;
+ ralign[rpos] = hpos+1;
+ }
+ else
+ {
+ cerr << "ERROR : terCalc::calculateTerAlignment : Invalid mini align sequence " << sym << " at pos " << i << endl;
+ exit ( -1 );
+ }
}
- ok = true;
-
- /* check to see if there are any errors in either string
- (only move if this is the case!)
- */
+ }
- bool any_rerr = false;
- for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) {
- if ( rerr[moveto+i] ) {
- any_rerr = true;
- }
- }
- if ( ! any_rerr ) {
- continue;
+ vector<vecTerShift> terCalc::calculerPermutations ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign )
+ {
+ vector<vecTerShift> to_return;
+ if ( ( TAILLE_PERMUT_MAX <= 0 ) || ( DIST_MAX_PERMUT <= 0 ) )
+ {
+ return to_return;
}
- for ( int roff = -1; roff <= ( end - start ); roff++ ) {
- terShift topush;
- bool topushNull = true;
- if ( ( roff == -1 ) && ( moveto == 0 ) ) {
- if ( PRINT_DEBUG ) {
- cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 01 : " << endl << "Consider making " << start << "..." << end << " moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl;
+ vector<vecTerShift> allshifts ( TAILLE_PERMUT_MAX + 1 );
+ for ( int start = 0; start < ( int ) hyp.size(); start++ )
+ {
+ string subVectorHypString = vectorToString ( subVector ( hyp, start, start + 1 ) );
+ if ( ! rloc.trouve ( subVectorHypString ) )
+ {
+ continue;
}
- terShift t01 ( start, end, -1, -1 );
- topush = t01;
- topushNull = false;
- } else if ( ( start != ralign[moveto+roff] ) && ( ( roff == 0 ) || ( ralign[moveto+roff] != ralign[moveto] ) ) ) {
- int newloc = ralign[moveto+roff];
- if ( PRINT_DEBUG ) {
- cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 02 : " << endl << "Consider making " << start << "..." << end << " moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl;
+ bool ok = false;
+ vector<int> mtiVec = rloc.getValue ( subVectorHypString );
+ vector<int>::iterator mti = mtiVec.begin();
+ while ( mti != mtiVec.end() && ( ! ok ) )
+ {
+ int moveto = ( *mti );
+ mti++;
+ if ( ( start != ralign[moveto] ) && ( ( ralign[moveto] - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign[moveto] - 1 ) <= DIST_MAX_PERMUT ) )
+ {
+ ok = true;
+ }
}
- terShift t02 ( start, end, moveto + roff, newloc );
- topush = t02;
- topushNull = false;
- }
- if ( !topushNull ) {
- topush.shifted = cand;
- topush.cost = shift_cost;
- if ( PRINT_DEBUG ) {
-
- cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 02 : " << endl;
- cerr << "start : " << start << endl;
- cerr << "end : " << end << endl;
- cerr << "end - start : " << end - start << endl;
- cerr << "END DEBUG " << endl;
+ if ( ! ok )
+ {
+ continue;
+ }
+ ok = true;
+ for ( int end = start; ( ok && ( end < ( int ) hyp.size() ) && ( end < start + TAILLE_PERMUT_MAX ) ); end++ )
+ {
+ /* check if cand is good if so, add it */
+ vector<string> cand = subVector ( hyp, start, end + 1 );
+ ok = false;
+ if ( ! ( rloc.trouve ( vectorToString ( cand ) ) ) )
+ {
+ continue;
+ }
+
+ bool any_herr = false;
+
+ for ( int i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ )
+ {
+ if ( herr[start+i] )
+ {
+ any_herr = true;
+ }
+ }
+ if ( any_herr == false )
+ {
+ ok = true;
+ continue;
+ }
+
+ vector<int> movetoitVec;
+ movetoitVec = rloc.getValue ( ( string ) vectorToString ( cand ) );
+// cerr << "CANDIDATE " << ( string ) vectorToString ( cand ) <<" PLACED : " << ( string ) vectorToString ( movetoitVec," ") << endl;
+ vector<int>::iterator movetoit = movetoitVec.begin();
+ while ( movetoit != movetoitVec.end() )
+ {
+ int moveto = ( *movetoit );
+ movetoit++;
+ if ( ! ( ( ralign[moveto] != start ) && ( ( ralign[moveto] < start ) || ( ralign[moveto] > end ) ) && ( ( ralign[moveto] - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign[moveto] ) <= DIST_MAX_PERMUT ) ) )
+ {
+ continue;
+ }
+ ok = true;
+
+ /* check to see if there are any errors in either string
+ (only move if this is the case!)
+ */
+
+ bool any_rerr = false;
+ for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ )
+ {
+ if ( rerr[moveto+i] )
+ {
+ any_rerr = true;
+ }
+ }
+ if ( ! any_rerr )
+ {
+ continue;
+ }
+ for ( int roff = -1; roff <= ( end - start ); roff++ )
+ {
+ terShift topush;
+ bool topushNull = true;
+ if ( ( roff == -1 ) && ( moveto == 0 ) )
+ {
+ if ( PRINT_DEBUG )
+ {
+
+ cerr << "BEGIN DEBUG : terCalc::calculerPermutations 01 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl;
+ }
+ terShift t01 ( start, end, -1, -1 );
+ topush = t01;
+ topushNull = false;
+ }
+ else
+ if ( ( start != ralign[moveto+roff] ) && ( ( roff == 0 ) || ( ralign[moveto+roff] != ralign[moveto] ) ) )
+ {
+ int newloc = ralign[moveto+roff];
+ if ( PRINT_DEBUG )
+ {
+
+ cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl;
+ }
+ terShift t02 ( start, end, moveto + roff, newloc );
+ topush = t02;
+ topushNull = false;
+ }
+ if ( !topushNull )
+ {
+ topush.shifted = cand;
+ topush.cost = shift_cost;
+ if ( PRINT_DEBUG )
+ {
+
+ cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl;
+ cerr << "start : " << start << endl;
+ cerr << "end : " << end << endl;
+ cerr << "end - start : " << end - start << endl;
+ cerr << "END DEBUG " << endl;
+ }
+ ( allshifts.at ( end - start ) ).push_back ( topush );
+ }
+ }
+ }
}
- ( allshifts.at ( end - start ) ).push_back ( topush );
- }
}
- }
+ to_return.clear();
+ for ( int i = 0; i < TAILLE_PERMUT_MAX + 1; i++ )
+ {
+ to_return.push_back ( ( vecTerShift ) allshifts.at ( i ) );
+ }
+ return to_return;
}
- }
-// vector<vecTerShift> to_return;
- to_return.clear();
-// terShift[][] to_return = new terShift[MAX_SHIFT_SIZE+1][];
- for ( int i = 0; i < MAX_SHIFT_SIZE + 1; i++ ) {
-// to_return[i] = (terShift[]) allshifts[i].toArray(new terShift[0]);
- to_return.push_back ( ( vecTerShift ) allshifts.at ( i ) );
- }
- return to_return;
-}
-alignmentStruct terCalc::PerformShift ( vector<string> words, terShift s )
-{
- return PerformShift ( words, s.start, s.end, s.newloc );
-}
-
+ alignmentStruct terCalc::permuter ( vector<string> words, terShift s )
+ {
+ return permuter ( words, s.start, s.end, s.newloc );
+ }
-alignmentStruct terCalc::PerformShift ( vector<string> words, int start, int end, int newloc )
-{
- int c = 0;
- vector<string> nwords ( words );
- vector<vecInt> spans ( ( int ) hypSpans.size() );
- alignmentStruct toreturn;
-// ON EST ICI
-// if((int)hypSpans.size()>0) spans = new TERintpair[(int)hypSpans.size()];
-// if(DEBUG) {
- if ( PRINT_DEBUG ) {
- if ( ( int ) hypSpans.size() > 0 ) {
- cerr << "BEGIN DEBUG : terCalc::PerformShift :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl << "END DEBUG " << endl;
- } else {
- cerr << "BEGIN DEBUG : terCalc::PerformShift :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl << "END DEBUG " << endl;
- }
- }
+ alignmentStruct terCalc::permuter ( vector<string> words, int start, int end, int newloc )
+ {
+ int c = 0;
+ vector<string> nwords ( words );
+ vector<vecInt> spans ( ( int ) hypSpans.size() );
+ alignmentStruct to_return;
+ if ( PRINT_DEBUG )
+ {
+
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl ;
+ }
+ else
+ {
+ cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl ;
+ }
+ cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << join(" ",words) << " start: " << start << " end: " << end << " newloc "<< newloc << endl << "END DEBUG " << endl;
+ }
+ if (newloc >= ( int ) words.size())
+ {
+ if ( PRINT_DEBUG )
+ {
+ cerr << "WARNING: Relocation over the size of the hypothesis, replacing at the end of it."<<endl;
+ }
+ newloc = ( int ) words.size()-1;
+ }
+
// }
- if ( newloc == -1 ) {
- for ( int i = start; i <= end; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = 0; i <= start - 1; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- } else {
- if ( newloc < start ) {
- for ( int i = 0; i <= newloc; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = start; i <= end; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = newloc + 1; i <= start - 1; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- } else {
- if ( newloc > end ) {
- for ( int i = 0; i <= start - 1; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = end + 1; i <= newloc; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = start; i <= end; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = newloc + 1; i < ( int ) words.size(); i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- } else {
- // we are moving inside of ourselves
- for ( int i = 0; i <= start - 1; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
- }
- for ( int i = start; i <= end; i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
+ if ( newloc == -1 )
+ {
+ for ( int i = start; i <= end;i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = 0; i <= start - 1;i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = end + 1; i < ( int ) words.size();i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
}
- for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size(); i++ ) {
- nwords.at ( c++ ) = words.at ( i );
- if ( ( int ) hypSpans.size() > 0 ) {
- spans.at ( c - 1 ) = hypSpans.at ( i );
- }
+ else
+ {
+ if ( newloc < start )
+ {
+
+ for ( int i = 0; i < newloc; i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = start; i <= end;i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = newloc ; i < start ;i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = end + 1; i < ( int ) words.size();i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ }
+ else
+ {
+ if ( newloc > end )
+ {
+ for ( int i = 0; i <= start - 1; i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = end + 1; i <= newloc;i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = start; i <= end;i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = newloc + 1; i < ( int ) words.size();i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ }
+ else
+ {
+ // we are moving inside of ourselves
+ for ( int i = 0; i <= start - 1; i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = start; i <= end;i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size();i++ )
+ {
+ nwords.at ( c++ ) = words.at ( i );
+ if ( ( int ) hypSpans.size() > 0 )
+ {
+ spans.at ( c - 1 ) = hypSpans.at ( i );
+ }
+ }
+ }
+ }
}
- }
+ NBR_PERMUTS_CONSID++;
+
+ if ( PRINT_DEBUG )
+ {
+ cerr << "nwords" << join(" ",nwords) << endl;
+// cerr << "spans" << spans. << endl;
+ }
+
+ to_return.nwords = nwords;
+ to_return.aftershift = spans;
+ return to_return;
+ }
+ void terCalc::setDebugMode ( bool b )
+ {
+ PRINT_DEBUG = b;
}
- }
- NUM_SHIFTS_CONSIDERED++;
-
- toreturn.nwords = nwords;
- toreturn.aftershift = spans;
- return toreturn;
-}
-void terCalc::setDebugMode ( bool b )
-{
- PRINT_DEBUG = b;
-}
}
diff --git a/mert/TER/tercalc.h b/mert/TER/tercalc.h
index 9e1a01f65..92d9caf2b 100644
--- a/mert/TER/tercalc.h
+++ b/mert/TER/tercalc.h
@@ -1,5 +1,25 @@
-#ifndef MERT_TER_TER_CALC_H_
-#define MERT_TER_TER_CALC_H_
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
+#ifndef _TERCPPTERCALC_H__
+#define _TERCPPTERCALC_H__
#include <vector>
#include <stdio.h>
@@ -21,62 +41,63 @@ namespace TERCpp
{
// typedef size_t WERelement[2];
// Vecteur d'alignement contenant le hash du mot et son evaluation (0=ok, 1=sub, 2=ins, 3=del)
-typedef vector<terShift> vecTerShift;
-/**
- @author
-*/
-class terCalc
-{
-private :
+ typedef vector<terShift> vecTerShift;
+ /**
+ @author
+ */
+ class terCalc
+ {
+ private :
// Vecteur d'alignement contenant le hash du mot et son evaluation (0=ok, 1=sub, 2=ins, 3=del)
- WERalignment l_WERalignment;
-// HashMap contenant les caleurs de hash de chaque mot
- hashMap bagOfWords;
- int MAX_SHIFT_SIZE;
- /* Variables for some internal counting. */
- int NUM_SEGMENTS_SCORED;
- int NUM_SHIFTS_CONSIDERED;
- int NUM_BEAM_SEARCH_CALLS;
- int MAX_SHIFT_DIST;
- bool PRINT_DEBUG;
+ WERalignment l_WERalignment;
+// HashMap contenant les valeurs de hash de chaque mot
+ hashMap bagOfWords;
+ int TAILLE_PERMUT_MAX;
+ // Increments internes
+ int NBR_SEGS_EVALUATED;
+ int NBR_PERMUTS_CONSID;
+ int NBR_BS_APPELS;
+ int DIST_MAX_PERMUT;
+ bool PRINT_DEBUG;
- /* These are resized by the MIN_EDIT_DIST code if they aren't big enough */
- double S[1000][1000];
- char P[1000][1000];
- vector<vecInt> refSpans;
- vector<vecInt> hypSpans;
- int BEAM_WIDTH;
+ // Utilisés dans minDistEdit et ils ne sont pas réajustés
+ double S[1000][1000];
+ char P[1000][1000];
+ vector<vecInt> refSpans;
+ vector<vecInt> hypSpans;
+ int TAILLE_BEAM;
-public:
- int shift_cost;
- int insert_cost;
- int delete_cost;
- int substitute_cost;
- int match_cost;
- double INF;
- terCalc();
+ public:
+ int shift_cost;
+ int insert_cost;
+ int delete_cost;
+ int substitute_cost;
+ int match_cost;
+ double infinite;
+ terCalc();
// ~terCalc();
// size_t* hashVec ( vector<string> s );
- void setDebugMode ( bool b );
- int WERCalculation ( size_t * ref, size_t * hyp );
- int WERCalculation ( vector<string> ref, vector<string> hyp );
- int WERCalculation ( vector<int> ref, vector<int> hyp );
+ void setDebugMode ( bool b );
+// int WERCalculation ( size_t * ref, size_t * hyp );
+// int WERCalculation ( vector<string> ref, vector<string> hyp );
+// int WERCalculation ( vector<int> ref, vector<int> hyp );
+ terAlignment WERCalculation ( vector<string> hyp, vector<string> ref );
// string vectorToString(vector<string> vec);
// vector<string> subVector(vector<string> vec, int start, int end);
- hashMapInfos BuildWordMatches ( vector<string> hyp, vector<string> ref );
- terAlignment MinEditDist ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans );
- bool spanIntersection ( vecInt refSpan, vecInt hypSpan );
- terAlignment TER ( vector<string> hyp, vector<string> ref , float avRefLength );
- terAlignment TER ( vector<string> hyp, vector<string> ref );
- terAlignment TER ( vector<int> hyp, vector<int> ref );
- bestShiftStruct CalcBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment cur_align );
- void FindAlignErr ( terAlignment align, bool* herr, bool* rerr, int* ralign );
- vector<vecTerShift> GatherAllPossShifts ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign );
- alignmentStruct PerformShift ( vector<string> words, terShift s );
- alignmentStruct PerformShift ( vector<string> words, int start, int end, int newloc );
-};
+ hashMapInfos createConcordMots ( vector<string> hyp, vector<string> ref );
+ terAlignment minimizeDistanceEdition ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans );
+ bool trouverIntersection ( vecInt refSpan, vecInt hypSpan );
+ terAlignment TER ( vector<string> hyp, vector<string> ref , float avRefLength );
+ terAlignment TER ( vector<string> hyp, vector<string> ref );
+ terAlignment TER ( vector<int> hyp, vector<int> ref );
+ bestShiftStruct findBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment cur_align );
+ void calculateTerAlignment ( terAlignment align, bool* herr, bool* rerr, int* ralign );
+ vector<vecTerShift> calculerPermutations ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign );
+ alignmentStruct permuter ( vector<string> words, terShift s );
+ alignmentStruct permuter ( vector<string> words, int start, int end, int newloc );
+ };
}
-#endif // MERT_TER_TER_CALC_H_
+#endif
diff --git a/mert/TER/tools.cpp b/mert/TER/tools.cpp
index 2d910ec05..64e1483b6 100644
--- a/mert/TER/tools.cpp
+++ b/mert/TER/tools.cpp
@@ -1,545 +1,772 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#include "tools.h"
using namespace std;
+using namespace boost::xpressive;
namespace Tools
{
-string vectorToString ( vector<string> vec )
-{
- string retour ( "" );
- for ( vector<string>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) {
- if ( vecIter == vec.begin() ) {
- retour += ( *vecIter );
- } else {
- retour += "\t" + ( *vecIter );
+ string vectorToString ( vector<string> vec )
+ {
+ string retour ( "" );
+ for ( vector<string>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ )
+ {
+ if ( vecIter == vec.begin() )
+ {
+ retour += ( *vecIter );
+ }
+ else
+ {
+ retour += "\t" + ( *vecIter );
+ }
+ }
+ return retour;
+ }
+ string vectorToString ( vector<char> vec )
+ {
+ stringstream retour;
+ retour.str("");
+ for ( vector<char>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ )
+ {
+ if ( vecIter == vec.begin() )
+ {
+ retour << ( *vecIter );
+ }
+ else
+ {
+ retour << "\t" << ( *vecIter );
+ }
+ }
+ return retour.str();
+ }
+ string vectorToString ( vector<int> vec )
+ {
+ stringstream retour;
+ retour.str("");
+ for ( vector<int>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ )
+ {
+ if ( vecIter == vec.begin() )
+ {
+ retour << ( *vecIter );
+ }
+ else
+ {
+ retour << "\t" << ( *vecIter );
+ }
+ }
+ return retour.str();
}
- }
- return retour;
-}
-string vectorToString ( vector< string > vec, string s )
-{
- string retour ( "" );
- for ( vector<string>::iterator vecIter = vec.begin(); vecIter != vec.end(); vecIter++ ) {
- if ( vecIter == vec.begin() ) {
- retour += ( *vecIter );
- } else {
- retour += s + ( *vecIter );
+ string vectorToString ( vector< string > vec, string s )
+ {
+ string retour ( "" );
+ for ( vector<string>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ )
+ {
+ if ( vecIter == vec.begin() )
+ {
+ retour += ( *vecIter );
+ }
+ else
+ {
+ retour += s + ( *vecIter );
+ }
+ }
+ return retour;
+
}
- }
- return retour;
-}
+ string vectorToString ( vector< char > vec, string s )
+ {
+ stringstream retour;
+ retour.str("");
+ for ( vector<char>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ )
+ {
+ if ( vecIter == vec.begin() )
+ {
+ retour << ( *vecIter );
+ }
+ else
+ {
+ retour << s << ( *vecIter );
+ }
+ }
+ return retour.str();
-vector<string> subVector ( vector<string> vec, int start, int end )
-{
- vector<string> retour;
- if ( start > end ) {
- cerr << "ERREUR : TERcalc::subVector : end > start" << endl;
- exit ( 0 );
- }
- for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) {
- retour.push_back ( vec.at ( i ) );
- }
- return retour;
-}
+ }
-vector<int> subVector ( vector<int> vec, int start, int end )
-{
- vector<int> retour;
- if ( start > end ) {
- cerr << "ERREUR : TERcalc::subVector : end > start" << endl;
- exit ( 0 );
- }
- for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) {
- retour.push_back ( vec.at ( i ) );
- }
- return retour;
-}
+ string vectorToString ( vector< int > vec, string s )
+ {
+ stringstream retour;
+ retour.str("");
+ for ( vector<int>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ )
+ {
+ if ( vecIter == vec.begin() )
+ {
+ retour << ( *vecIter );
+ }
+ else
+ {
+ retour << s << ( *vecIter );
+ }
+ }
+ return retour.str();
-vector<float> subVector ( vector<float> vec, int start, int end )
-{
- vector<float> retour;
- if ( start > end ) {
- cerr << "ERREUR : TERcalc::subVector : end > start" << endl;
- exit ( 0 );
- }
- for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ ) {
- retour.push_back ( vec.at ( i ) );
- }
- return retour;
-}
+ }
+
+ string vectorToString ( vector< bool > vec, string s )
+ {
+ stringstream retour;
+ retour.str("");
+ for ( vector<bool>::iterator vecIter = vec.begin();vecIter != vec.end(); vecIter++ )
+ {
+ if ( vecIter == vec.begin() )
+ {
+ retour << ( *vecIter );
+ }
+ else
+ {
+ retour << s << ( *vecIter );
+ }
+ }
+ return retour.str();
-vector<string> copyVector ( vector<string> vec )
-{
- vector<string> retour;
- for ( int i = 0; i < ( int ) vec.size(); i++ ) {
- retour.push_back ( vec.at ( i ) );
- }
- return retour;
-}
-vector<int> copyVector ( vector<int> vec )
-{
- vector<int> retour;
- for ( int i = 0; i < ( int ) vec.size(); i++ ) {
- retour.push_back ( vec.at ( i ) );
- }
- return retour;
-}
-vector<float> copyVector ( vector<float> vec )
-{
- vector<float> retour;
- for ( int i = 0; i < ( int ) vec.size(); i++ ) {
- retour.push_back ( vec.at ( i ) );
- }
- return retour;
-}
-vector<string> stringToVector ( string s, string tok )
-{
- vector<string> to_return;
- string to_push ( "" );
- bool pushed = false;
- string::iterator sIt;
- for ( sIt = s.begin(); sIt < s.end(); sIt++ ) {
- pushed = false;
- for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) {
- if ( ( *sIt ) == ( *sTok ) ) {
- to_return.push_back ( to_push );
- to_push = "";
- pushed = true;
- }
}
- if ( !pushed ) {
- to_push.push_back ( ( *sIt ) );
+ string vectorToString ( char* vec, string s , int taille)
+ {
+ stringstream retour;
+ retour.str("");
+ int l_i;
+ for ( l_i=0; l_i < taille ; l_i++)
+ {
+ if ( l_i == 0 )
+ {
+ retour << vec[l_i];
+ }
+ else
+ {
+ retour << s << vec[l_i];
+ }
+ }
+ return retour.str();
+
}
- }
- to_return.push_back ( to_push );
- return to_return;
-}
-vector<int> stringToVectorInt ( string s, string tok )
-{
- vector<int> to_return;
- string to_push ( "" );
- bool pushed = false;
- string::iterator sIt;
- for ( sIt = s.begin(); sIt < s.end(); sIt++ ) {
- pushed = false;
- for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) {
- if ( ( *sIt ) == ( *sTok ) ) {
- if ( ( int ) to_push.length() > 0 ) {
- to_return.push_back ( atoi ( to_push.c_str() ) );
+
+ string vectorToString ( int* vec, string s , int taille)
+ {
+ stringstream retour;
+ retour.str("");
+ int l_i;
+ for ( l_i=0; l_i < taille ; l_i++)
+ {
+ if ( l_i == 0 )
+ {
+ retour << vec[l_i];
+ }
+ else
+ {
+ retour << s << vec[l_i];
+ }
}
- to_push = "";
- pushed = true;
- }
+ return retour.str();
+
}
- if ( !pushed ) {
- to_push.push_back ( ( *sIt ) );
+
+ string vectorToString ( bool* vec, string s , int taille)
+ {
+ stringstream retour;
+ retour.str("");
+ int l_i;
+ for ( l_i=0; l_i < taille ; l_i++)
+ {
+ if ( l_i == 0 )
+ {
+ retour << vec[l_i];
+ }
+ else
+ {
+ retour << s << vec[l_i];
+ }
+ }
+ return retour.str();
+
}
- }
- if ( ( int ) to_push.length() > 0 ) {
- to_return.push_back ( atoi ( to_push.c_str() ) );
- }
- return to_return;
-}
-vector<float> stringToVectorFloat ( string s, string tok )
-{
- vector<float> to_return;
- string to_push ( "" );
- bool pushed = false;
- string::iterator sIt;
- for ( sIt = s.begin(); sIt < s.end(); sIt++ ) {
- pushed = false;
- for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) {
- if ( ( *sIt ) == ( *sTok ) ) {
- if ( ( int ) to_push.length() > 0 ) {
- to_return.push_back ( atof ( to_push.c_str() ) );
+
+ vector<string> subVector ( vector<string> vec, int start, int end )
+ {
+ vector<string> retour;
+ if ( start > end )
+ {
+ cerr << "ERREUR : TERcalc::subVector : end > start" << endl;
+ exit ( 0 );
+ }
+ for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ )
+ {
+ retour.push_back ( vec.at ( i ) );
}
- to_push = "";
- pushed = true;
- }
+ return retour;
}
- if ( !pushed ) {
- to_push.push_back ( ( *sIt ) );
+
+ vector<int> subVector ( vector<int> vec, int start, int end )
+ {
+ vector<int> retour;
+ if ( start > end )
+ {
+ cerr << "ERREUR : TERcalc::subVector : end > start" << endl;
+ exit ( 0 );
+ }
+ for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ )
+ {
+ retour.push_back ( vec.at ( i ) );
+ }
+ return retour;
}
- }
- if ( ( int ) to_push.length() > 0 ) {
- to_return.push_back ( atoi ( to_push.c_str() ) );
- }
- return to_return;
-}
-
-string lowerCase ( string str )
-{
- for ( int i = 0; i < ( int ) str.size(); i++ ) {
- if ( ( str[i] >= 0x41 ) && ( str[i] <= 0x5A ) ) {
- str[i] = str[i] + 0x20;
+
+ vector<float> subVector ( vector<float> vec, int start, int end )
+ {
+ vector<float> retour;
+ if ( start > end )
+ {
+ cerr << "ERREUR : TERcalc::subVector : end > start" << endl;
+ exit ( 0 );
+ }
+ for ( int i = start; ( ( i < end ) && ( i < ( int ) vec.size() ) ); i++ )
+ {
+ retour.push_back ( vec.at ( i ) );
+ }
+ return retour;
+ }
+
+ vector<string> copyVector ( vector<string> vec )
+ {
+ vector<string> retour;
+ for ( int i = 0; i < ( int ) vec.size(); i++ )
+ {
+ retour.push_back ( vec.at ( i ) );
+ }
+ return retour;
+ }
+ vector<int> copyVector ( vector<int> vec )
+ {
+ vector<int> retour;
+ for ( int i = 0; i < ( int ) vec.size(); i++ )
+ {
+ retour.push_back ( vec.at ( i ) );
+ }
+ return retour;
+ }
+ vector<float> copyVector ( vector<float> vec )
+ {
+ vector<float> retour;
+ for ( int i = 0; i < ( int ) vec.size(); i++ )
+ {
+ retour.push_back ( vec.at ( i ) );
+ }
+ return retour;
+ }
+ vector<string> stringToVector ( string s, string tok )
+ {
+ vector<string> to_return;
+ string to_push ( "" );
+ bool pushed = false;
+ string::iterator sIt;
+ for ( sIt = s.begin(); sIt < s.end(); sIt++ )
+ {
+ pushed = false;
+ for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ )
+ {
+ if ( ( *sIt ) == ( *sTok ) )
+ {
+ to_return.push_back ( to_push );
+ to_push = "";
+ pushed = true;
+ }
+ }
+ if ( !pushed )
+ {
+ to_push.push_back ( ( *sIt ) );
+ }
+ }
+ to_return.push_back ( to_push );
+ return to_return;
+ }
+ vector<int> stringToVectorInt ( string s, string tok )
+ {
+ vector<int> to_return;
+ string to_push ( "" );
+ bool pushed = false;
+ string::iterator sIt;
+ for ( sIt = s.begin(); sIt < s.end(); sIt++ )
+ {
+ pushed = false;
+ for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ )
+ {
+ if ( ( *sIt ) == ( *sTok ) )
+ {
+ if ( ( int ) to_push.length() > 0 )
+ {
+ to_return.push_back ( atoi ( to_push.c_str() ) );
+ }
+ to_push = "";
+ pushed = true;
+ }
+ }
+ if ( !pushed )
+ {
+ to_push.push_back ( ( *sIt ) );
+ }
+ }
+ if ( ( int ) to_push.length() > 0 )
+ {
+ to_return.push_back ( atoi ( to_push.c_str() ) );
+ }
+ return to_return;
+ }
+ vector<float> stringToVectorFloat ( string s, string tok )
+ {
+ vector<float> to_return;
+ string to_push ( "" );
+ bool pushed = false;
+ string::iterator sIt;
+ for ( sIt = s.begin(); sIt < s.end(); sIt++ )
+ {
+ pushed = false;
+ for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ )
+ {
+ if ( ( *sIt ) == ( *sTok ) )
+ {
+ if ( ( int ) to_push.length() > 0 )
+ {
+ to_return.push_back ( atof ( to_push.c_str() ) );
+ }
+ to_push = "";
+ pushed = true;
+ }
+ }
+ if ( !pushed )
+ {
+ to_push.push_back ( ( *sIt ) );
+ }
+ }
+ if ( ( int ) to_push.length() > 0 )
+ {
+ to_return.push_back ( atoi ( to_push.c_str() ) );
+ }
+ return to_return;
}
- }
- return str;
-}
-/*
-string removePunctTercom ( string str )
-{
- string str_mod = str;
- sregex rex;
- string replace;
+ string lowerCase ( string str )
+ {
+ for ( int i = 0;i < ( int ) str.size();i++ )
+ {
+ if ( ( str[i] >= 0x41 ) && ( str[i] <= 0x5A ) )
+ {
+ str[i] = str[i] + 0x20;
+ }
+ }
+ return str;
+ }
+ string removePunctTercom ( string str )
+ {
+ string str_mod = str;
+ sregex rex;
+ string replace;
- rex = sregex::compile ( "^[ ]+" );
- replace = "";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "^[ ]+" );
+ replace = "";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\"]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\"]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[,]" );
- replace = " ";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[,]" );
+ replace = " ";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
- replace = ( "$1 $3" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
+ replace = ( "$1 $3" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
- replace = ( "$1 $3" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
+ replace = ( "$1 $3" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
- replace = ( "$1 $3" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
+ replace = ( "$1 $3" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "([\\.]$)" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "([\\.]$)" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\?]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\?]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\;]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\;]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\:]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\:]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\!]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\!]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\(]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\(]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\)]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\)]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[ ]+" );
- replace = " ";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[ ]+" );
+ replace = " ";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[ ]+$" );
- replace = "";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[ ]+$" );
+ replace = "";
+ str_mod = regex_replace ( str_mod, rex, replace );
- return str_mod;
-}
-string removePunct ( string str )
-{
- string str_mod = str;
- sregex rex;
- string replace;
+ return str_mod;
+ }
+ string removePunct ( string str )
+ {
+ string str_mod = str;
+ sregex rex;
+ string replace;
- rex = sregex::compile ( "^[ ]+" );
- replace = "";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "^[ ]+" );
+ replace = "";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\"]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\"]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[,]" );
- replace = " ";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[,]" );
+ replace = " ";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
- replace = ( "$1 $3" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
+ replace = ( "$1 $3" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
- replace = ( "$1 $3" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
+ replace = ( "$1 $3" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
- replace = ( "$1 $3" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
+ replace = ( "$1 $3" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "([\\.]$)" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "([\\.]$)" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\?]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\?]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\;]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\;]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\:]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\:]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\!]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\!]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\(]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\(]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\)]" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\)]" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[ ]+" );
- replace = " ";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[ ]+" );
+ replace = " ";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[ ]+$" );
- replace = "";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[ ]+$" );
+ replace = "";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "^[ ]+" );
- replace = "";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "^[ ]+" );
+ replace = "";
+ str_mod = regex_replace ( str_mod, rex, replace );
- return str_mod;
-}
-string tokenizePunct ( string str )
-{
- string str_mod = str;
- sregex rex = sregex::compile ( "(([^0-9])([\\,])([^0-9]))" );
- string replace ( "$2 $3 $4" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ return str_mod;
+ }
+ string tokenizePunct ( string str )
+ {
+ string str_mod = str;
+ sregex rex = sregex::compile ( "(([^0-9])([\\,])([^0-9]))" );
+ string replace ( "$2 $3 $4" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(([^0-9])([\\.])([^0-9]))" );
- replace = ( "$2 $3 $4" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(([^0-9])([\\.])([^0-9]))" );
+ replace = ( "$2 $3 $4" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.]) )" );
- replace = ( " $2. " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.]) )" );
+ replace = ( " $2. " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.])$)" );
- replace = ( " $2. " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.])$)" );
+ replace = ( " $2. " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(^([A-Z]|[a-z]) ([\\.]) )" );
- replace = ( " $2. " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(^([A-Z]|[a-z]) ([\\.]) )" );
+ replace = ( " $2. " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(([A-Z]|[a-z])([\\.]) ([A-Z]|[a-z])([\\.]) )" );
- replace = ( "$2.$4. " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(([A-Z]|[a-z])([\\.]) ([A-Z]|[a-z])([\\.]) )" );
+ replace = ( "$2.$4. " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\?]" );
- replace = ( " ? " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\?]" );
+ replace = ( " ? " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\;]" );
- replace = ( " ; " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\;]" );
+ replace = ( " ; " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(([^0-9])([\\:])([^0-9]))" );
- replace = ( "$2 $3 $4" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(([^0-9])([\\:])([^0-9]))" );
+ replace = ( "$2 $3 $4" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\!]" );
- replace = ( " ! " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\!]" );
+ replace = ( " ! " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\(]" );
- replace = ( " ( " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\(]" );
+ replace = ( " ( " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\\)]" );
- replace = ( " ) " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\\)]" );
+ replace = ( " ) " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[\"]" );
- replace = ( " \" " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[\"]" );
+ replace = ( " \" " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(num_ \\( ([^\\)]+) \\))" );
- replace = ( "num_($2)" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(num_ \\( ([^\\)]+) \\))" );
+ replace = ( "num_($2)" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(ordinal_ \\( ([^\\)]*) \\))" );
- replace = ( "ordinal_($2)" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(ordinal_ \\( ([^\\)]*) \\))" );
+ replace = ( "ordinal_($2)" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(^([Mm]) \\.)" );
- replace = ( "$2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(^([Mm]) \\.)" );
+ replace = ( "$2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( ([Mm]) \\.)" );
- replace = ( " $2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( ([Mm]) \\.)" );
+ replace = ( " $2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(^([Dd]r) \\.)" );
- replace = ( "$2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(^([Dd]r) \\.)" );
+ replace = ( "$2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( ([Dd]r) \\.)" );
- replace = ( " $2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( ([Dd]r) \\.)" );
+ replace = ( " $2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(^([Mm]r) \\.)" );
- replace = ( "$2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(^([Mm]r) \\.)" );
+ replace = ( "$2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( ([Mm]r) \\.)" );
- replace = ( " $2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( ([Mm]r) \\.)" );
+ replace = ( " $2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(^([Mm]rs) \\.)" );
- replace = ( "$2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(^([Mm]rs) \\.)" );
+ replace = ( "$2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( ([Mm]rs) \\.)" );
- replace = ( " $2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( ([Mm]rs) \\.)" );
+ replace = ( " $2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(^([Nn]o) \\.)" );
- replace = ( "$2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(^([Nn]o) \\.)" );
+ replace = ( "$2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( ([Nn]o) \\.)" );
- replace = ( " $2." );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( ([Nn]o) \\.)" );
+ replace = ( " $2." );
+ str_mod = regex_replace ( str_mod, rex, replace );
// rex = sregex::compile ( "(^(([Jj]an)|([Ff]ev)|([Mm]ar)|([Aa]pr)|([Jj]un)|([Jj]ul)|([Aa]ug)|([Ss]ept)|([Oo]ct)|([Nn]ov)|([Dd]ec)) \\.)" );
// replace = ( "$2." );
// str_mod = regex_replace ( str_mod, rex, replace );
-//
+//
// rex = sregex::compile ( "( (([Jj]an)|([Ff]ev)|([Mm]ar)|([Aa]pr)|([Jj]un)|([Jj]ul)|([Aa]ug)|([Ss]ept)|([Oo]ct)|([Nn]ov)|([Dd]ec)) \\.)" );
// replace = ( " $2." );
// str_mod = regex_replace ( str_mod, rex, replace );
-//
+//
// rex = sregex::compile ( "(^(([Gg]en)|([Cc]ol)) \\.)" );
// replace = ( "$2." );
// str_mod = regex_replace ( str_mod, rex, replace );
-//
+//
// rex = sregex::compile ( "( (([Gg]en)|([Cc]ol)) \\.)" );
// replace = ( " $2." );
// str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(^(([A-Z][a-z])) \\. )" );
- replace = ( "$2. " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(^(([A-Z][a-z])) \\. )" );
+ replace = ( "$2. " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( (([A-Z][a-z])) \\. )" );
- replace = ( " $2. " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( (([A-Z][a-z])) \\. )" );
+ replace = ( " $2. " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "(^(([A-Z][a-z][a-z])) \\. )" );
- replace = ( "$2. " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "(^(([A-Z][a-z][a-z])) \\. )" );
+ replace = ( "$2. " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "( (([A-Z][a-z][a-z])) \\. )" );
- replace = ( " $2. " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "( (([A-Z][a-z][a-z])) \\. )" );
+ replace = ( " $2. " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[ ]+" );
- replace = " ";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[ ]+" );
+ replace = " ";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "^[ ]+" );
- replace = "";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "^[ ]+" );
+ replace = "";
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "[ ]+$" );
- replace = "";
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "[ ]+$" );
+ replace = "";
+ str_mod = regex_replace ( str_mod, rex, replace );
- return str_mod;
-}
-
-string normalizeStd ( string str )
-{
- string str_mod = str;
- sregex rex = sregex::compile ( "(<skipped>)" );
- string replace ( "" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ return str_mod;
+ }
- rex = sregex::compile ( "-\n" );
- replace = ( "" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ string normalizeStd ( string str )
+ {
+ string str_mod = str;
+ sregex rex = sregex::compile ( "(<skipped>)" );
+ string replace ( "" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "\n" );
- replace = ( " " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "-\n" );
+ replace = ( "" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "&quot;" );
- replace = ( "\"" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "\n" );
+ replace = ( " " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "&amp;" );
- replace = ( "& " );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "&quot;" );
+ replace = ( "\"" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "&lt;" );
- replace = ( "<" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "&amp;" );
+ replace = ( "& " );
+ str_mod = regex_replace ( str_mod, rex, replace );
- rex = sregex::compile ( "&gt;" );
- replace = ( ">" );
- str_mod = regex_replace ( str_mod, rex, replace );
+ rex = sregex::compile ( "&lt;" );
+ replace = ( "<" );
+ str_mod = regex_replace ( str_mod, rex, replace );
- return str_mod;
-}
-*/
+ rex = sregex::compile ( "&gt;" );
+ replace = ( ">" );
+ str_mod = regex_replace ( str_mod, rex, replace );
-param copyParam ( param p )
-{
- param to_return;
- to_return.caseOn = p.caseOn;
- to_return.noPunct = p.noPunct;
- to_return.debugMode = p.debugMode;
- to_return.hypothesisFile = p.hypothesisFile;
- to_return.referenceFile = p.referenceFile;
- to_return.normalize = p.normalize;
- to_return.noTxtIds = p.noTxtIds;
- to_return.outputFileExtension = p.outputFileExtension;
- to_return.outputFileName = p.outputFileName;
- to_return.sgmlInputs = p.sgmlInputs;
- to_return.tercomLike = p.tercomLike;
- return to_return;
-}
-string printParams ( param p )
-{
- stringstream s;
- s << "caseOn = " << p.caseOn << endl;
- s << "noPunct = " << p.noPunct << endl;
- s << "debugMode = " << p.debugMode << endl;
- s << "hypothesisFile = " << p.hypothesisFile << endl;
- s << "referenceFile = " << p.referenceFile << endl;
- s << "normalize = " << p.normalize << endl;
- s << "noTxtIds = " << p.noTxtIds << endl;
- s << "outputFileExtension = " << p.outputFileExtension << endl;
- s << "outputFileName = " << p.outputFileName << endl;
- s << "sgmlInputs = " << p.sgmlInputs << endl;
- s << "tercomLike = " << p.tercomLike << endl;
- return s.str();
+ return str_mod;
+ }
-}
+ param copyParam ( param p )
+ {
+ param to_return;
+ to_return.caseOn = p.caseOn;
+ to_return.noPunct = p.noPunct;
+ to_return.debugMode = p.debugMode;
+ to_return.debugLevel = p.debugLevel;
+ to_return.hypothesisFile = p.hypothesisFile;
+ to_return.referenceFile = p.referenceFile;
+ to_return.normalize = p.normalize;
+ to_return.noTxtIds = p.noTxtIds;
+ to_return.outputFileExtension = p.outputFileExtension;
+ to_return.outputFileName = p.outputFileName;
+ to_return.sgmlInputs = p.sgmlInputs;
+ to_return.tercomLike = p.tercomLike;
+ to_return.printAlignments = p.printAlignments;
+ to_return.WER=p.WER;
+ return to_return;
+ }
+ string printParams ( param p )
+ {
+ stringstream s;
+ s << "caseOn = " << p.caseOn << endl;
+ s << "noPunct = " << p.noPunct << endl;
+ s << "debugMode = " << p.debugMode << endl;
+ s << "debugLevel = " << p.debugLevel << endl;
+ s << "hypothesisFile = " << p.hypothesisFile << endl;
+ s << "referenceFile = " << p.referenceFile << endl;
+ s << "normalize = " << p.normalize << endl;
+ s << "noTxtIds = " << p.noTxtIds << endl;
+ s << "outputFileExtension = " << p.outputFileExtension << endl;
+ s << "outputFileName = " << p.outputFileName << endl;
+ s << "sgmlInputs = " << p.sgmlInputs << endl;
+ s << "tercomLike = " << p.tercomLike << endl;
+ return s.str();
+ }
+ string join ( string delim, vector<string> arr )
+ {
+ if ( ( int ) arr.size() == 0 ) return "";
+// if ((int)delim.compare("") == 0) delim = new String("");
+// String s = new String("");
+ stringstream s;
+ s.str ( "" );
+ for ( int i = 0; i < ( int ) arr.size(); i++ )
+ {
+ if ( i == 0 )
+ {
+ s << arr.at ( i );
+ }
+ else
+ {
+ s << delim << arr.at ( i );
+ }
+ }
+ return s.str();
+// return "";
+ }
}
diff --git a/mert/TER/tools.h b/mert/TER/tools.h
index 6f78b9a6a..0a85e7b4b 100644
--- a/mert/TER/tools.h
+++ b/mert/TER/tools.h
@@ -1,38 +1,66 @@
+/*********************************
+tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
+
+Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
+Contact: christophe.servan@lium.univ-lemans.fr
+
+The tercpp tool and library are free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the licence, or
+(at your option) any later version.
+
+This program and library are distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**********************************/
#ifndef MERT_TER_TOOLS_H_
#define MERT_TER_TOOLS_H_
+
#include <vector>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <sstream>
+#include <boost/xpressive/xpressive.hpp>
+
using namespace std;
namespace Tools
{
-typedef vector<double> vecDouble;
-typedef vector<char> vecChar;
-typedef vector<int> vecInt;
-typedef vector<float> vecFloat;
-typedef vector<string> vecString;
-typedef vector<string> alignmentElement;
-typedef vector<alignmentElement> WERalignment;
+ typedef vector<double> vecDouble;
+ typedef vector<char> vecChar;
+ typedef vector<int> vecInt;
+ typedef vector<float> vecFloat;
+ typedef vector<size_t> vecSize_t;
+ typedef vector<string> vecString;
+ typedef vector<string> alignmentElement;
+ typedef vector<alignmentElement> WERalignment;
-struct param {
- bool debugMode;
- string referenceFile; // path to the resources
- string hypothesisFile; // path to the configuration files
- string outputFileExtension;
- string outputFileName;
- bool noPunct;
- bool caseOn;
- bool normalize;
- bool tercomLike;
- bool sgmlInputs;
- bool noTxtIds;
+struct param
+{
+ bool debugMode;
+ string referenceFile; // path to the resources
+ string hypothesisFile; // path to the configuration files
+ string outputFileExtension;
+ string outputFileName;
+ bool noPunct;
+ bool caseOn;
+ bool normalize;
+ bool tercomLike;
+ bool sgmlInputs;
+ bool noTxtIds;
+ bool printAlignments;
+ bool WER;
+ int debugLevel;
};
// param = { false, "","","","" };
@@ -40,26 +68,35 @@ struct param {
// private:
// public:
-string vectorToString ( vector<string> vec );
-string vectorToString ( vector<string> vec, string s );
-vector<string> subVector ( vector<string> vec, int start, int end );
-vector<int> subVector ( vector<int> vec, int start, int end );
-vector<float> subVector ( vector<float> vec, int start, int end );
-vector<string> copyVector ( vector<string> vec );
-vector<int> copyVector ( vector<int> vec );
-vector<float> copyVector ( vector<float> vec );
-vector<string> stringToVector ( string s, string tok );
-vector<int> stringToVectorInt ( string s, string tok );
-vector<float> stringToVectorFloat ( string s, string tok );
-string lowerCase(string str);
-string removePunct(string str);
-string tokenizePunct(string str);
-string removePunctTercom(string str);
-string normalizeStd(string str);
-string printParams(param p);
+ string vectorToString ( vector<string> vec );
+ string vectorToString ( vector<char> vec );
+ string vectorToString ( vector<int> vec );
+ string vectorToString ( vector<string> vec, string s );
+ string vectorToString ( vector<char> vec, string s );
+ string vectorToString ( vector<int> vec, string s );
+ string vectorToString ( vector<bool> vec, string s );
+ string vectorToString ( char* vec, string s, int taille );
+ string vectorToString ( int* vec, string s , int taille );
+ string vectorToString ( bool* vec, string s , int taille );
+ vector<string> subVector ( vector<string> vec, int start, int end );
+ vector<int> subVector ( vector<int> vec, int start, int end );
+ vector<float> subVector ( vector<float> vec, int start, int end );
+ vector<string> copyVector ( vector<string> vec );
+ vector<int> copyVector ( vector<int> vec );
+ vector<float> copyVector ( vector<float> vec );
+ vector<string> stringToVector ( string s, string tok );
+ vector<string> stringToVector ( char s, string tok );
+ vector<string> stringToVector ( int s, string tok );
+ vector<int> stringToVectorInt ( string s, string tok );
+ vector<float> stringToVectorFloat ( string s, string tok );
+ string lowerCase(string str);
+ string removePunct(string str);
+ string tokenizePunct(string str);
+ string removePunctTercom(string str);
+ string normalizeStd(string str);
+ string printParams(param p);
+ string join ( string delim, vector<string> arr );
// };
-param copyParam(param p);
-
+ param copyParam(param p);
}
-
-#endif // MERT_TER_TOOLS_H_
+#endif
diff --git a/mert/TerScorer.cpp b/mert/TerScorer.cpp
index 7c11ea66b..fc40fdc82 100644
--- a/mert/TerScorer.cpp
+++ b/mert/TerScorer.cpp
@@ -101,7 +101,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
entry.set ( stats_str );
}
-float TerScorer::calculateScore(const vector<int>& comps) const
+float TerScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
float denom = 1.0 * comps[1];
float num = -1.0 * comps[0];
diff --git a/mert/TerScorer.h b/mert/TerScorer.h
index 5e9fed46f..f84e86006 100644
--- a/mert/TerScorer.h
+++ b/mert/TerScorer.h
@@ -31,7 +31,7 @@ public:
return kLENGTH + 1;
}
- virtual float calculateScore(const std::vector<int>& comps) const;
+ virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const;
private:
const int kLENGTH;
diff --git a/mert/Types.h b/mert/Types.h
index 0ea9614d9..10f00de16 100644
--- a/mert/Types.h
+++ b/mert/Types.h
@@ -33,7 +33,7 @@ typedef FeatureStatsType* featstats_t;
typedef std::vector<FeatureStats> featarray_t;
typedef std::vector<FeatureArray> featdata_t;
-typedef int ScoreStatsType;
+typedef float ScoreStatsType;
typedef ScoreStatsType* scorestats_t;
//typedef std::vector<ScoreStatsType> scorestats_t;
typedef std::vector<ScoreStats> scorearray_t;
diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp
index eb95c9018..caae07684 100644
--- a/mert/evaluator.cpp
+++ b/mert/evaluator.cpp
@@ -14,6 +14,7 @@
#include "ScorerFactory.h"
#include "Timer.h"
#include "Util.h"
+#include "Data.h"
using namespace std;
using namespace MosesTuning;
@@ -30,17 +31,20 @@ const float g_alpha = 0.05;
class EvaluatorUtil
{
public:
- static void evaluate(const string& candFile, int bootstrap);
+ static void evaluate(const string& candFile, int bootstrap, bool nbest_mode);
static float average(const vector<float>& list);
static string int2string(int n);
+ static vector<ScoreStats> loadNBest(const string& nBestFile);
+ static vector<ScoreStats> loadCand(const string& candFile);
private:
EvaluatorUtil() {}
~EvaluatorUtil() {}
};
-void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
-{
+// load hypothesis from candidate output
+vector<ScoreStats> EvaluatorUtil::loadCand(const string& candFile) {
+
ifstream cand(candFile.c_str());
if (!cand.good()) throw runtime_error("Error opening candidate file");
@@ -53,6 +57,34 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
g_scorer->prepareStats(entries.size(), line, scoreentry);
entries.push_back(scoreentry);
}
+ return entries;
+}
+
+// load 1-best hypothesis from n-best file (useful if relying on alignment/tree information)
+vector<ScoreStats> EvaluatorUtil::loadNBest(const string& nBestFile) {
+ vector<ScoreStats> entries;
+
+ Data data(g_scorer);
+ data.loadNBest(nBestFile, true);
+ const ScoreDataHandle & score_data = data.getScoreData();
+ for (size_t i = 0; i != score_data->size(); i++) {
+ entries.push_back(score_data->get(i, 0));
+ }
+ return entries;
+}
+
+
+void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_input)
+{
+
+ vector<ScoreStats> entries;
+
+ if (nbest_input) {
+ entries = loadNBest(candFile);
+ }
+ else {
+ entries = loadCand(candFile);
+ }
int n = entries.size();
if (bootstrap) {
@@ -131,6 +163,7 @@ void usage()
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
cerr << "[--reference|-R] comma separated list of reference files" << endl;
cerr << "[--candidate|-C] comma separated list of candidate files" << endl;
+ cerr << "[--nbest|-n] comma separated list of nbest files (only 1-best is evaluated)" << endl;
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
cerr << "[--filter|-l] filter command which will be used to preprocess the sentences" << endl;
cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl;
@@ -162,6 +195,7 @@ static struct option long_options[] = {
{"scconfig", required_argument, 0, 'c'},
{"reference", required_argument, 0, 'R'},
{"candidate", required_argument, 0, 'C'},
+ {"nbest", required_argument, 0, 'n'},
{"bootstrap", required_argument, 0, 'b'},
{"rseed", required_argument, 0, 'r'},
{"factors", required_argument, 0, 'f'},
@@ -176,6 +210,7 @@ struct ProgramOption {
vector<string> scorer_configs;
string reference;
string candidate;
+ string nbest;
vector<string> scorer_factors;
vector<string> scorer_filter;
int bootstrap;
@@ -185,6 +220,7 @@ struct ProgramOption {
ProgramOption()
: reference(""),
candidate(""),
+ nbest(""),
bootstrap(0),
seed(0),
has_seed(false) { }
@@ -195,7 +231,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
int c;
int option_index;
int last_scorer_index = -1;
- while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:l:h", long_options, &option_index)) != -1) {
+ while ((c = getopt_long(argc, argv, "s:c:R:C:n:b:r:f:l:h", long_options, &option_index)) != -1) {
switch(c) {
case 's':
opt->scorer_types.push_back(string(optarg));
@@ -205,6 +241,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
last_scorer_index++;
break;
case 'c':
+ if (last_scorer_index == -1) throw runtime_error("You need to specify a scorer before its config string.");
opt->scorer_configs[last_scorer_index] = string(optarg);
break;
case 'R':
@@ -213,6 +250,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
case 'C':
opt->candidate = string(optarg);
break;
+ case 'n':
+ opt->nbest = string(optarg);
+ break;
case 'b':
opt->bootstrap = atoi(optarg);
break;
@@ -221,9 +261,11 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
opt->has_seed = true;
break;
case 'f':
+ if (last_scorer_index == -1) throw runtime_error("You need to specify a scorer before its list of factors.");
opt->scorer_factors[last_scorer_index] = string(optarg);
break;
case 'l':
+ if (last_scorer_index == -1) throw runtime_error("You need to specify a scorer before its filter.");
opt->scorer_filter[last_scorer_index] = string(optarg);
break;
default:
@@ -271,8 +313,13 @@ int main(int argc, char** argv)
if (option.reference.length() == 0) throw runtime_error("You have to specify at least one reference file.");
split(option.reference, ',', refFiles);
- if (option.candidate.length() == 0) throw runtime_error("You have to specify at least one candidate file.");
- split(option.candidate, ',', candFiles);
+ if (option.candidate.length() == 0 && option.nbest.length() == 0) throw runtime_error("You have to specify at least one candidate (or n-best) file.");
+ if (option.candidate.length() > 0 && option.nbest.length() > 0) throw runtime_error("You can either specify candidate files or n-best files, but not both.");
+ bool nbest_input = option.nbest.length() > 0;
+ if (nbest_input)
+ split(option.nbest, ',', candFiles);
+ else
+ split(option.candidate, ',', candFiles);
if (candFiles.size() > 1) g_has_more_files = true;
if (option.scorer_types.size() > 1) g_has_more_scorers = true;
@@ -283,7 +330,7 @@ int main(int argc, char** argv)
g_scorer->setFactors(option.scorer_factors[i]);
g_scorer->setFilter(option.scorer_filter[i]);
g_scorer->setReferenceFiles(refFiles);
- EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
+ EvaluatorUtil::evaluate(*fileIt, option.bootstrap, nbest_input);
delete g_scorer;
}
}
diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp
index a2665ac13..0abce8af4 100644
--- a/mert/kbmira.cpp
+++ b/mert/kbmira.cpp
@@ -39,77 +39,71 @@ de recherches du Canada
#include <boost/program_options.hpp>
#include <boost/scoped_ptr.hpp>
+#include "util/exception.hh"
+
#include "BleuScorer.h"
-#include "HypPackEnumerator.h"
+#include "HopeFearDecoder.h"
#include "MiraFeatureVector.h"
#include "MiraWeightVector.h"
+#include "Scorer.h"
+#include "ScorerFactory.h"
+
using namespace std;
using namespace MosesTuning;
namespace po = boost::program_options;
-ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv)
-{
- vector<ValType> stats(kBleuNgramOrder*2+1,0);
- for(train->reset(); !train->finished(); train->next()) {
- // Find max model
- size_t max_index=0;
- ValType max_score=0;
- for(size_t i=0; i<train->cur_size(); i++) {
- MiraFeatureVector vec(train->featuresAt(i));
- ValType score = wv.score(vec);
- if(i==0 || score > max_score) {
- max_index = i;
- max_score = score;
- }
- }
- // Update stats
- const vector<float>& sent = train->scoresAt(max_index);
- for(size_t i=0; i<sent.size(); i++) {
- stats[i]+=sent[i];
- }
- }
- return unsmoothedBleu(stats);
-}
-
int main(int argc, char** argv)
{
- const ValType BLEU_RATIO = 5;
bool help;
string denseInitFile;
string sparseInitFile;
+ string type = "nbest";
+ string sctype = "BLEU";
+ string scconfig = "";
vector<string> scoreFiles;
vector<string> featureFiles;
+ vector<string> referenceFiles; //for hg mira
+ string hgDir;
int seed;
string outputFile;
float c = 0.01; // Step-size cap C
float decay = 0.999; // Pseudo-corpus decay \gamma
int n_iters = 60; // Max epochs J
bool streaming = false; // Stream all k-best lists?
+ bool streaming_out = false; // Stream output after each sentence?
bool no_shuffle = false; // Don't shuffle, even for in memory version
bool model_bg = false; // Use model for background corpus
bool verbose = false; // Verbose updates
bool safe_hope = false; // Model score cannot have more than BLEU_RATIO times more influence than BLEU
+ size_t hgPruning = 50; //prune hypergraphs to have this many edges per reference word
// Command-line processing follows pro.cpp
po::options_description desc("Allowed options");
desc.add_options()
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
+ ("type,t", po::value<string>(&type), "Either nbest or hypergraph")
+ ("sctype", po::value<string>(&sctype), "the scorer type (default BLEU)")
+ ("scconfig,c", po::value<string>(&scconfig), "configuration string passed to scorer")
("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
+ ("hgdir,H", po::value<string> (&hgDir), "Directory containing hypergraphs")
+ ("reference,R", po::value<vector<string> > (&referenceFiles), "Reference files, only required for hypergraph mira")
("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
("output-file,o", po::value<string>(&outputFile), "Output file")
("cparam,C", po::value<float>(&c), "MIRA C-parameter, lower for more regularization (default 0.01)")
("decay,D", po::value<float>(&decay), "BLEU background corpus decay rate (default 0.999)")
("iters,J", po::value<int>(&n_iters), "Number of MIRA iterations to run (default 60)")
- ("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features")
+ ("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features. This should have 'name= value' on each line, or (legacy) should be the Moses mert 'init.opt' format.")
("sparse-init,s", po::value<string>(&sparseInitFile), "Weight file for sparse features")
("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle")
+ ("streaming-out", po::value(&streaming_out)->zero_tokens()->default_value(false), "Stream weights to stdout after each sentence")
("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch")
("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background")
("verbose", po::value(&verbose)->zero_tokens()->default_value(false), "Verbose updates")
("safe-hope", po::value(&safe_hope)->zero_tokens()->default_value(false), "Mode score's influence on hope decoding is limited")
+ ("hg-prune", po::value<size_t>(&hgPruning), "Prune hypergraphs to have this many edges per reference word")
;
po::options_description cmdline_options;
@@ -145,12 +139,56 @@ int main(int argc, char** argv)
cerr << "could not open dense initfile: " << denseInitFile << endl;
exit(3);
}
+ if (verbose) cerr << "Reading dense features:" << endl;
parameter_t val;
getline(opt,buffer);
- istringstream strstrm(buffer);
- while(strstrm >> val) {
- initParams.push_back(val);
+ if (buffer.find_first_of("=") == buffer.npos) {
+ UTIL_THROW_IF(type == "hypergraph", util::Exception, "For hypergraph version, require dense features in 'name= value' format");
+ cerr << "WARN: dense features in deprecated Moses mert format. Prefer 'name= value' format." << endl;
+ istringstream strstrm(buffer);
+ while(strstrm >> val) {
+ initParams.push_back(val);
+ if(verbose) cerr << val << endl;
+ }
+ } else {
+ vector<string> names;
+ string last_name = "";
+ size_t feature_ctr = 1;
+ do {
+ size_t equals = buffer.find_last_of("=");
+ UTIL_THROW_IF(equals == buffer.npos, util::Exception, "Incorrect format in dense feature file: '"
+ << buffer << "'");
+ string name = buffer.substr(0,equals);
+ names.push_back(name);
+ initParams.push_back(boost::lexical_cast<ValType>(buffer.substr(equals+2)));
+
+ //Names for features with several values need to have their id added
+ if (name != last_name) feature_ctr = 1;
+ last_name = name;
+ if (feature_ctr>1) {
+ stringstream namestr;
+ namestr << names.back() << "_" << feature_ctr;
+ names[names.size()-1] = namestr.str();
+ if (feature_ctr == 2) {
+ stringstream namestr;
+ namestr << names[names.size()-2] << "_" << (feature_ctr-1);
+ names[names.size()-2] = namestr.str();
+ }
+ }
+ ++feature_ctr;
+
+ } while(getline(opt,buffer));
+
+
+ //Make sure that SparseVector encodes dense feature names as 0..n-1.
+ for (size_t i = 0; i < names.size(); ++i) {
+ size_t id = SparseVector::encode(names[i]);
+ assert(id == i);
+ if (verbose) cerr << names[i] << " " << initParams[i] << endl;
+ }
+
}
+
opt.close();
}
size_t initDenseSize = initParams.size();
@@ -180,90 +218,55 @@ int main(int argc, char** argv)
MiraWeightVector wv(initParams);
+ // Initialize scorer
+ if(sctype != "BLEU" && type == "hypergraph") {
+ UTIL_THROW(util::Exception, "hypergraph mira only supports BLEU");
+ }
+ boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer(sctype, scconfig));
+
// Initialize background corpus
- vector<ValType> bg;
- for(int j=0; j<kBleuNgramOrder; j++) {
- bg.push_back(kBleuNgramOrder-j);
- bg.push_back(kBleuNgramOrder-j);
+ vector<ValType> bg(scorer->NumberOfScores(), 1);
+
+ boost::scoped_ptr<HopeFearDecoder> decoder;
+ if (type == "nbest") {
+ decoder.reset(new NbestHopeFearDecoder(featureFiles, scoreFiles, streaming, no_shuffle, safe_hope, scorer.get()));
+ } else if (type == "hypergraph") {
+ decoder.reset(new HypergraphHopeFearDecoder(hgDir, referenceFiles, initDenseSize, streaming, no_shuffle, safe_hope, hgPruning, wv, scorer.get()));
+ } else {
+ UTIL_THROW(util::Exception, "Unknown batch mira type: '" << type << "'");
}
- bg.push_back(kBleuNgramOrder);
// Training loop
- boost::scoped_ptr<HypPackEnumerator> train;
- if(streaming)
- train.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles));
- else
- train.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle));
- cerr << "Initial BLEU = " << evaluate(train.get(), wv.avg()) << endl;
+ if (!streaming_out)
+ cerr << "Initial BLEU = " << decoder->Evaluate(wv.avg()) << endl;
ValType bestBleu = 0;
for(int j=0; j<n_iters; j++) {
// MIRA train for one epoch
- int iNumHyps = 0;
int iNumExamples = 0;
int iNumUpdates = 0;
ValType totalLoss = 0.0;
- for(train->reset(); !train->finished(); train->next()) {
- // Hope / fear decode
- ValType hope_scale = 1.0;
- size_t hope_index=0, fear_index=0, model_index=0;
- ValType hope_score=0, fear_score=0, model_score=0;
- int iNumHypsBackup = iNumHyps;
- for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
- iNumHyps = iNumHypsBackup;
- ValType hope_bleu, hope_model;
- for(size_t i=0; i< train->cur_size(); i++) {
- const MiraFeatureVector& vec=train->featuresAt(i);
- ValType score = wv.score(vec);
- ValType bleu = sentenceLevelBackgroundBleu(train->scoresAt(i),bg);
- // Hope
- if(i==0 || (hope_scale*score + bleu) > hope_score) {
- hope_score = hope_scale*score + bleu;
- hope_index = i;
- hope_bleu = bleu;
- hope_model = score;
- }
- // Fear
- if(i==0 || (score - bleu) > fear_score) {
- fear_score = score - bleu;
- fear_index = i;
- }
- // Model
- if(i==0 || score > model_score) {
- model_score = score;
- model_index = i;
- }
- iNumHyps++;
- }
- // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases
- // where model score is having far more influence than BLEU
- hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU
- if(safe_hope && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale)
- hope_scale = abs(hope_bleu) / abs(hope_model);
- else break;
- }
+ size_t sentenceIndex = 0;
+ for(decoder->reset();!decoder->finished(); decoder->next()) {
+ HopeFearData hfd;
+ decoder->HopeFear(bg,wv,&hfd);
+
// Update weights
- if(hope_index!=fear_index) {
+ if (!hfd.hopeFearEqual && hfd.hopeBleu > hfd.fearBleu) {
// Vector difference
- const MiraFeatureVector& hope=train->featuresAt(hope_index);
- const MiraFeatureVector& fear=train->featuresAt(fear_index);
- MiraFeatureVector diff = hope - fear;
+ MiraFeatureVector diff = hfd.hopeFeatures - hfd.fearFeatures;
// Bleu difference
- const vector<float>& hope_stats = train->scoresAt(hope_index);
- ValType hopeBleu = sentenceLevelBackgroundBleu(hope_stats, bg);
- const vector<float>& fear_stats = train->scoresAt(fear_index);
- ValType fearBleu = sentenceLevelBackgroundBleu(fear_stats, bg);
- assert(hopeBleu + 1e-8 >= fearBleu);
- ValType delta = hopeBleu - fearBleu;
+ //assert(hfd.hopeBleu + 1e-8 >= hfd.fearBleu);
+ ValType delta = hfd.hopeBleu - hfd.fearBleu;
// Loss and update
ValType diff_score = wv.score(diff);
ValType loss = delta - diff_score;
if(verbose) {
- cerr << "Updating sent " << train->cur_id() << endl;
+ cerr << "Updating sent " << sentenceIndex << endl;
cerr << "Wght: " << wv << endl;
- cerr << "Hope: " << hope << " BLEU:" << hopeBleu << " Score:" << wv.score(hope) << endl;
- cerr << "Fear: " << fear << " BLEU:" << fearBleu << " Score:" << wv.score(fear) << endl;
+ cerr << "Hope: " << hfd.hopeFeatures << " BLEU:" << hfd.hopeBleu << " Score:" << wv.score(hfd.hopeFeatures) << endl;
+ cerr << "Fear: " << hfd.fearFeatures << " BLEU:" << hfd.fearBleu << " Score:" << wv.score(hfd.fearFeatures) << endl;
cerr << "Diff: " << diff << " BLEU:" << delta << " Score:" << diff_score << endl;
- cerr << "Loss: " << loss << " Scale: " << hope_scale << endl;
+ cerr << "Loss: " << loss << " Scale: " << 1 << endl;
cerr << endl;
}
if(loss > 0) {
@@ -273,16 +276,18 @@ int main(int argc, char** argv)
iNumUpdates++;
}
// Update BLEU statistics
- const vector<float>& model_stats = train->scoresAt(model_index);
for(size_t k=0; k<bg.size(); k++) {
bg[k]*=decay;
if(model_bg)
- bg[k]+=model_stats[k];
+ bg[k]+=hfd.modelStats[k];
else
- bg[k]+=hope_stats[k];
+ bg[k]+=hfd.hopeStats[k];
}
}
iNumExamples++;
+ ++sentenceIndex;
+ if (streaming_out)
+ cout << wv << endl;
}
// Training Epoch summary
cerr << iNumUpdates << "/" << iNumExamples << " updates"
@@ -291,15 +296,16 @@ int main(int argc, char** argv)
// Evaluate current average weights
AvgWeightVector avg = wv.avg();
- ValType bleu = evaluate(train.get(), avg);
+ ValType bleu = decoder->Evaluate(avg);
cerr << ", BLEU = " << bleu << endl;
if(bleu > bestBleu) {
+ /*
size_t num_dense = train->num_dense();
if(initDenseSize>0 && initDenseSize!=num_dense) {
cerr << "Error: Initial dense feature count and dense feature count from n-best do not match: "
<< initDenseSize << "!=" << num_dense << endl;
exit(1);
- }
+ }*/
// Write to a file
ostream* out;
ofstream outFile;
@@ -314,11 +320,11 @@ int main(int argc, char** argv)
out = &cout;
}
for(size_t i=0; i<avg.size(); i++) {
- if(i<num_dense)
+ if(i<initDenseSize)
*out << "F" << i << " " << avg.weight(i) << endl;
else {
if(abs(avg.weight(i))>1e-8)
- *out << SparseVector::decode(i-num_dense) << " " << avg.weight(i) << endl;
+ *out << SparseVector::decode(i-initDenseSize) << " " << avg.weight(i) << endl;
}
}
outFile.close();
diff --git a/mira/Decoder.cpp b/mira/Decoder.cpp
index a4d6fdb64..64b6a2e5f 100644
--- a/mira/Decoder.cpp
+++ b/mira/Decoder.cpp
@@ -143,8 +143,8 @@ vector< vector<const Word*> > MosesDecoder::runDecoder(const std::string& source
string filename)
{
// run the decoder
- m_manager = new Moses::Manager(0,*m_sentence, search);
- m_manager->ProcessSentence();
+ m_manager = new Moses::Manager(*m_sentence, search);
+ m_manager->Decode();
TrellisPathList nBestList;
m_manager->CalcNBest(nBestSize, nBestList, distinct);
@@ -221,7 +221,7 @@ vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& s
{
// run the decoder
m_chartManager = new ChartManager(*m_sentence);
- m_chartManager->ProcessSentence();
+ m_chartManager->Decode();
ChartKBestExtractor::KBestVec nBestList;
m_chartManager->CalcNBest(nBestSize, nBestList, distinct);
diff --git a/mira/Jamfile b/mira/Jamfile
index 3862cb172..e43a993b5 100644
--- a/mira/Jamfile
+++ b/mira/Jamfile
@@ -2,7 +2,7 @@ lib mira_lib :
[ glob *.cpp : *Test.cpp Main.cpp ]
../mert//mert_lib ../moses//moses ../OnDiskPt//OnDiskPt ..//boost_program_options ;
-exe mira : Main.cpp mira_lib ../mert//mert_lib ../moses//moses ../OnDiskPt//OnDiskPt ..//boost_program_options ;
+exe mira : Main.cpp mira_lib ../mert//mert_lib ../moses//moses ../OnDiskPt//OnDiskPt ..//boost_program_options ..//boost_filesystem ;
alias programs : mira ;
diff --git a/mira/Main.cpp b/mira/Main.cpp
index 13f71bc2c..abf92b598 100644
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -665,7 +665,7 @@ int main(int argc, char** argv)
}
// number of weight dumps this epoch
- // size_t weightMixingThisEpoch = 0;
+ size_t weightMixingThisEpoch = 0;
size_t weightEpochDump = 0;
size_t shardPosition = 0;
diff --git a/misc/CreateProbingPT.cpp b/misc/CreateProbingPT.cpp
new file mode 100644
index 000000000..3ea369a96
--- /dev/null
+++ b/misc/CreateProbingPT.cpp
@@ -0,0 +1,20 @@
+#include "util/usage.hh"
+#include "moses/TranslationModel/ProbingPT/storing.hh"
+
+
+
+int main(int argc, char* argv[]){
+
+ if (argc != 3) {
+ // Tell the user how to run the program
+ std::cerr << "Provided " << argc << " arguments, needed 3." << std::endl;
+ std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir" << std::endl;
+ return 1;
+ }
+
+ createProbingPT(argv[1], argv[2]);
+
+ util::PrintUsage(std::cout);
+ return 0;
+}
+
diff --git a/misc/Jamfile b/misc/Jamfile
index 76f91babb..8cc7aa9a8 100644
--- a/misc/Jamfile
+++ b/misc/Jamfile
@@ -1,22 +1,24 @@
-exe processPhraseTable : GenerateTuples.cpp processPhraseTable.cpp ../moses//moses ;
+exe processPhraseTable : GenerateTuples.cpp processPhraseTable.cpp ..//boost_filesystem ../moses//moses ;
-exe processLexicalTable : processLexicalTable.cpp ../moses//moses ;
+exe processLexicalTable : processLexicalTable.cpp ..//boost_filesystem ../moses//moses ;
-exe queryPhraseTable : queryPhraseTable.cpp ../moses//moses ;
+exe queryPhraseTable : queryPhraseTable.cpp ..//boost_filesystem ../moses//moses ;
-exe queryLexicalTable : queryLexicalTable.cpp ../moses//moses ;
+exe queryLexicalTable : queryLexicalTable.cpp ..//boost_filesystem ../moses//moses ;
-exe generateSequences : GenerateSequences.cpp ../moses//moses ;
+exe generateSequences : GenerateSequences.cpp ..//boost_filesystem ../moses//moses ;
-exe TMining : TransliterationMining.cpp ../moses//moses ;
+exe TMining : TransliterationMining.cpp ..//boost_filesystem ../moses//moses ;
-exe 1-1-Extraction : 1-1-Extraction.cpp ../moses//moses ;
+exe 1-1-Extraction : 1-1-Extraction.cpp ..//boost_filesystem ../moses//moses ;
+
+exe prunePhraseTable : prunePhraseTable.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ;
local with-cmph = [ option.get "with-cmph" ] ;
if $(with-cmph) {
- exe processPhraseTableMin : processPhraseTableMin.cpp ../moses//moses ;
- exe processLexicalTableMin : processLexicalTableMin.cpp ../moses//moses ;
- exe queryPhraseTableMin : queryPhraseTableMin.cpp ../moses//moses ;
+ exe processPhraseTableMin : processPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ;
+ exe processLexicalTableMin : processLexicalTableMin.cpp ..//boost_filesystem ../moses//moses ;
+ exe queryPhraseTableMin : queryPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ;
alias programsMin : processPhraseTableMin processLexicalTableMin queryPhraseTableMin ;
# alias programsMin : processPhraseTableMin processLexicalTableMin ;
@@ -25,4 +27,23 @@ else {
alias programsMin ;
}
-alias programs : 1-1-Extraction TMining generateSequences processPhraseTable processLexicalTable queryPhraseTable queryLexicalTable programsMin ;
+if [ option.get "with-probing-pt" : : "yes" ]
+{
+ exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ;
+ exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
+
+ alias programsProbing : CreateProbingPT QueryProbingPT ;
+}
+else {
+ alias programsProbing ;
+}
+
+exe merge-sorted :
+merge-sorted.cc
+../moses//moses
+../moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_program_options
+;
+
+alias programs : 1-1-Extraction TMining generateSequences processPhraseTable processLexicalTable queryPhraseTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ;
diff --git a/misc/QueryProbingPT.cpp b/misc/QueryProbingPT.cpp
new file mode 100644
index 000000000..8a3441a0d
--- /dev/null
+++ b/misc/QueryProbingPT.cpp
@@ -0,0 +1,61 @@
+#include "util/file_piece.hh"
+
+#include "util/file.hh"
+#include "util/scoped.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/murmur_hash.hh"
+#include "util/probing_hash_table.hh"
+#include "util/usage.hh"
+
+#include "moses/TranslationModel/ProbingPT/quering.hh"
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h> //For finding size of file
+#include <boost/functional/hash.hpp>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+int main(int argc, char* argv[]) {
+ if (argc != 2) {
+ // Tell the user how to run the program
+ std::cerr << "Usage: " << argv[0] << " path_to_directory" << std::endl;
+ return 1;
+ }
+
+ QueryEngine queries(argv[1]);
+
+ //Interactive search
+ std::cout << "Please enter a string to be searched, or exit to exit." << std::endl;
+ while (true){
+ std::string cinstr = "";
+ getline(std::cin, cinstr);
+ if (cinstr == "exit"){
+ break;
+ }else{
+ //Actual lookup
+ std::pair<bool, std::vector<target_text> > query_result;
+ query_result = queries.query(StringPiece(cinstr));
+
+ if (query_result.first) {
+ queries.printTargetInfo(query_result.second);
+ } else {
+ std::cout << "Key not found!" << std::endl;
+ }
+ }
+ }
+
+ util::PrintUsage(std::cout);
+
+ return 0;
+}
diff --git a/contrib/m4m/util/merge-sorted.cc b/misc/merge-sorted.cc
index ae693215b..ae693215b 100644
--- a/contrib/m4m/util/merge-sorted.cc
+++ b/misc/merge-sorted.cc
diff --git a/misc/prunePhraseTable.cpp b/misc/prunePhraseTable.cpp
new file mode 100644
index 000000000..dcf8d73da
--- /dev/null
+++ b/misc/prunePhraseTable.cpp
@@ -0,0 +1,227 @@
+// $Id$
+// vim:tabstop=2
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+
+/**
+ Prune the phrase table using the same translation pruning that Moses uses during decoding.
+**/
+
+#include <cstring>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/scoped_ptr.hpp>
+
+#include "moses/InputPath.h"
+#include "moses/Parameter.h"
+#include "moses/TranslationModel/PhraseDictionary.h"
+#include "moses/StaticData.h"
+
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/double-conversion/double-conversion.h"
+#include "util/exception.hh"
+
+
+using namespace Moses;
+using namespace std;
+
+namespace po = boost::program_options;
+typedef multimap<float,string> Lines;
+
+static void usage(const po::options_description& desc, char** argv) {
+ cerr << "Usage: " + string(argv[0]) + " [options] input-file output-file" << endl;
+ cerr << desc << endl;
+}
+
+//Find top n translations of source, and send them to output
+static void outputTopN(Lines lines, size_t maxPhrases, ostream& out) {
+ size_t count = 0;
+ for (Lines::const_reverse_iterator i = lines.rbegin(); i != lines.rend(); ++i) {
+ out << i->second << endl;
+ ++count;
+ if (count >= maxPhrases) break;
+ }
+}
+/*
+static void outputTopN(const Phrase& sourcePhrase, const multimap<float,const TargetPhrase*>& targetPhrases,
+ size_t maxPhrases, const PhraseDictionary* phraseTable,
+ const vector<FactorType> & input, const vector<FactorType> & output, ostream& out) {
+ size_t count = 0;
+ for (multimap<float,const TargetPhrase*>::const_reverse_iterator i
+ = targetPhrases.rbegin(); i != targetPhrases.rend() && count < maxPhrases; ++i, ++count) {
+ const TargetPhrase* targetPhrase = i->second;
+ out << sourcePhrase.GetStringRep(input);
+ out << " ||| ";
+ out << targetPhrase->GetStringRep(output);
+ out << " ||| ";
+ const ScoreComponentCollection scores = targetPhrase->GetScoreBreakdown();
+ vector<float> phraseScores = scores.GetScoresForProducer(phraseTable);
+ for (size_t j = 0; j < phraseScores.size(); ++j) {
+ out << exp(phraseScores[j]) << " ";
+ }
+ out << "||| ";
+ const AlignmentInfo& align = targetPhrase->GetAlignTerm();
+ for (AlignmentInfo::const_iterator j = align.begin(); j != align.end(); ++j) {
+ out << j->first << "-" << j->second << " ";
+ }
+ out << endl;
+ }
+}*/
+int main(int argc, char** argv)
+{
+ bool help;
+ string input_file;
+ string config_file;
+ size_t maxPhrases = 100;
+
+
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
+ ("input-file,i", po::value<string>(&input_file), "Input file")
+ ("config-file,f", po::value<string>(&config_file), "Config file")
+ ("max-phrases,n", po::value<size_t>(&maxPhrases), "Maximum target phrases per source phrase")
+ ;
+
+ po::options_description cmdline_options;
+ cmdline_options.add(desc);
+ po::variables_map vm;
+ po::parsed_options parsed = po::command_line_parser(argc,argv).
+ options(cmdline_options).run();
+ po::store(parsed, vm);
+ po::notify(vm);
+ if (help) {
+ usage(desc, argv);
+ exit(0);
+ }
+ if (input_file.empty()) {
+ cerr << "ERROR: Please specify an input file" << endl << endl;
+ usage(desc, argv);
+ exit(1);
+ }
+ if (config_file.empty()) {
+ cerr << "ERROR: Please specify a config file" << endl << endl;
+ usage(desc, argv);
+ exit(1);
+ }
+
+ vector<string> mosesargs;
+ mosesargs.push_back(argv[0]);
+ mosesargs.push_back("-f");
+ mosesargs.push_back(config_file);
+
+ boost::scoped_ptr<Parameter> params(new Parameter());
+ char** mosesargv = new char*[mosesargs.size()];
+ for (size_t i = 0; i < mosesargs.size(); ++i) {
+ mosesargv[i] = new char[mosesargs[i].length() + 1];
+ strcpy(mosesargv[i], mosesargs[i].c_str());
+ }
+
+ if (!params->LoadParam(mosesargs.size(), mosesargv)) {
+ params->Explain();
+ exit(1);
+ }
+
+ if (!StaticData::LoadDataStatic(params.get(),argv[0])) {
+ exit(1);
+ }
+
+ const StaticData &staticData = StaticData::Instance();
+
+ //Find the phrase table to manage the target phrases
+ PhraseDictionary* phraseTable = NULL;
+ const vector<FeatureFunction*>& ffs = FeatureFunction::GetFeatureFunctions();
+ for (size_t i = 0; i < ffs.size(); ++i) {
+ PhraseDictionary* maybePhraseTable = dynamic_cast< PhraseDictionary*>(ffs[i]);
+ if (maybePhraseTable) {
+ UTIL_THROW_IF(phraseTable,util::Exception,"Can only score translations with one phrase table");
+ phraseTable = maybePhraseTable;
+ }
+ }
+ UTIL_THROW_IF(!phraseTable,util::Exception,"Unable to find scoring phrase table");
+
+
+ //
+ //Load and prune the phrase table. This is taken (with mods) from moses/TranslationModel/RuleTable/LoaderStandard.cpp
+ //
+
+ std::ostream *progress = NULL;
+ IFVERBOSE(1) progress = &std::cerr;
+ util::FilePiece in(input_file.c_str(), progress);
+
+ // reused variables
+ vector<float> scoreVector;
+ StringPiece line;
+
+ double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+
+ string previous;
+ Lines lines;
+
+
+ while(true) {
+ try {
+ line = in.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+
+ util::TokenIter<util::MultiCharacter> pipes(line, "|||");
+ StringPiece sourcePhraseString(*pipes);
+ StringPiece targetPhraseString(*++pipes);
+ StringPiece scoreString(*++pipes);
+ scoreVector.clear();
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+ int processed;
+ float score = converter.StringToFloat(s->data(), s->length(), &processed);
+ UTIL_THROW_IF2(isnan(score), "Bad score " << *s);
+ scoreVector.push_back(FloorScore(TransformScore(score)));
+ }
+
+ if (sourcePhraseString != previous) {
+ outputTopN(lines, maxPhrases, cout);
+ previous = sourcePhraseString.as_string();
+ lines.clear();
+ }
+
+ ScoreComponentCollection scores;
+ scores.Assign(phraseTable,scoreVector);
+ float score = scores.InnerProduct(staticData.GetAllWeights());
+ lines.insert(pair<float,string>(score,line.as_string()));
+
+ }
+ if (!lines.empty()) {
+ outputTopN(lines, maxPhrases, cout);
+ }
+
+
+
+
+
+ return 0;
+}
diff --git a/misc/queryPhraseTableMin.cpp b/misc/queryPhraseTableMin.cpp
index 723370252..ca4b4b690 100644
--- a/misc/queryPhraseTableMin.cpp
+++ b/misc/queryPhraseTableMin.cpp
@@ -51,9 +51,9 @@ int main(int argc, char **argv)
// const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||");
// UG: I assume "||dummy_string||" means: I'm not using factored data;
// This is now expressed by setting the factor delimiter to the empty string
- const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "");
- const_cast<std::vector<std::string>&>(parameter->GetParam("input-factors")).resize(1, "0");
- const_cast<std::vector<std::string>&>(parameter->GetParam("verbose")).resize(1, "0");
+ const_cast<std::vector<std::string>&>(*parameter->GetParam("factor-delimiter")).resize(1, "");
+ const_cast<std::vector<std::string>&>(*parameter->GetParam("input-factors")).resize(1, "0");
+ const_cast<std::vector<std::string>&>(*parameter->GetParam("verbose")).resize(1, "0");
//const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0");
//const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0");
diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp
deleted file mode 100644
index f4d2366ad..000000000
--- a/moses-chart-cmd/IOWrapper.cpp
+++ /dev/null
@@ -1,1013 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file on how to use moses library
-
-#include <iostream>
-#include <boost/algorithm/string.hpp>
-#include "IOWrapper.h"
-#include "moses/TypeDef.h"
-#include "moses/Util.h"
-#include "moses/WordsRange.h"
-#include "moses/StaticData.h"
-#include "moses/InputFileStream.h"
-#include "moses/Incremental.h"
-#include "moses/TranslationModel/PhraseDictionary.h"
-#include "moses/ChartTranslationOptions.h"
-#include "moses/ChartHypothesis.h"
-#include "moses/FeatureVector.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-#include "moses/FF/TreeStructureFeature.h"
-#include "moses/PP/TreeStructurePhraseProperty.h"
-#include "util/exception.hh"
-
-using namespace std;
-using namespace Moses;
-
-namespace MosesChartCmd
-{
-
-IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
- , const std::vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &inputFilePath)
- :m_inputFactorOrder(inputFactorOrder)
- ,m_outputFactorOrder(outputFactorOrder)
- ,m_inputFactorUsed(inputFactorUsed)
- ,m_outputSearchGraphStream(NULL)
- ,m_detailedTranslationReportingStream(NULL)
- ,m_detailedTreeFragmentsTranslationReportingStream(NULL)
- ,m_alignmentInfoStream(NULL)
- ,m_unknownsStream(NULL)
- ,m_inputFilePath(inputFilePath)
- ,m_detailOutputCollector(NULL)
- ,m_detailTreeFragmentsOutputCollector(NULL)
- ,m_nBestOutputCollector(NULL)
- ,m_searchGraphOutputCollector(NULL)
- ,m_singleBestOutputCollector(NULL)
- ,m_alignmentInfoCollector(NULL)
- ,m_unknownsCollector(NULL)
-{
- const StaticData &staticData = StaticData::Instance();
-
- if (m_inputFilePath.empty()) {
- m_inputStream = &std::cin;
- } else {
- m_inputStream = new InputFileStream(inputFilePath);
- }
-
- bool suppressSingleBestOutput = false;
-
- if (nBestSize > 0) {
- if (nBestFilePath == "-") {
- m_nBestOutputCollector = new Moses::OutputCollector(&std::cout);
- suppressSingleBestOutput = true;
- } else {
- m_nBestOutputCollector = new Moses::OutputCollector(new std::ofstream(nBestFilePath.c_str()));
- m_nBestOutputCollector->HoldOutputStream();
- }
- }
-
- if (!suppressSingleBestOutput) {
- m_singleBestOutputCollector = new Moses::OutputCollector(&std::cout);
- }
-
- // search graph output
- if (staticData.GetOutputSearchGraph()) {
- string fileName = staticData.GetParam("output-search-graph")[0];
- std::ofstream *file = new std::ofstream;
- m_outputSearchGraphStream = file;
- file->open(fileName.c_str());
- m_searchGraphOutputCollector = new Moses::OutputCollector(m_outputSearchGraphStream);
- }
-
- // detailed translation reporting
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
- m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
- m_detailOutputCollector = new Moses::OutputCollector(m_detailedTranslationReportingStream);
- }
-
- if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
- const std::string &path = staticData.GetDetailedTreeFragmentsTranslationReportingFilePath();
- m_detailedTreeFragmentsTranslationReportingStream = new std::ofstream(path.c_str());
- m_detailTreeFragmentsOutputCollector = new Moses::OutputCollector(m_detailedTreeFragmentsTranslationReportingStream);
- }
-
- if (!staticData.GetAlignmentOutputFile().empty()) {
- m_alignmentInfoStream = new std::ofstream(staticData.GetAlignmentOutputFile().c_str());
- m_alignmentInfoCollector = new Moses::OutputCollector(m_alignmentInfoStream);
- UTIL_THROW_IF2(!m_alignmentInfoStream->good(),
- "File for alignment output could not be opened: " << staticData.GetAlignmentOutputFile());
- }
-
- if (!staticData.GetOutputUnknownsFile().empty()) {
- m_unknownsStream = new std::ofstream(staticData.GetOutputUnknownsFile().c_str());
- m_unknownsCollector = new Moses::OutputCollector(m_unknownsStream);
- UTIL_THROW_IF2(!m_unknownsStream->good(),
- "File for unknowns words could not be opened: " <<
- staticData.GetOutputUnknownsFile());
- }
-}
-
-IOWrapper::~IOWrapper()
-{
- if (!m_inputFilePath.empty()) {
- delete m_inputStream;
- }
- delete m_outputSearchGraphStream;
- delete m_detailedTranslationReportingStream;
- delete m_detailedTreeFragmentsTranslationReportingStream;
- delete m_alignmentInfoStream;
- delete m_unknownsStream;
- delete m_detailOutputCollector;
- delete m_nBestOutputCollector;
- delete m_searchGraphOutputCollector;
- delete m_singleBestOutputCollector;
- delete m_alignmentInfoCollector;
- delete m_unknownsCollector;
-}
-
-void IOWrapper::ResetTranslationId()
-{
- m_translationId = StaticData::Instance().GetStartTranslationId();
-}
-
-InputType*IOWrapper::GetInput(InputType* inputType)
-{
- if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
- if (long x = inputType->GetTranslationId()) {
- if (x>=m_translationId) m_translationId = x+1;
- } else inputType->SetTranslationId(m_translationId++);
-
- return inputType;
- } else {
- delete inputType;
- return NULL;
- }
-}
-
-
-/***
- * print surface factor only for the given phrase
- */
-void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors)
-{
- UTIL_THROW_IF2(outputFactorOrder.size() == 0,
- "Cannot be empty phrase");
- if (reportAllFactors == true) {
- out << phrase;
- } else {
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
- out << *factor;
- UTIL_THROW_IF2(factor == NULL,
- "Empty factor 0 at position " << pos);
-
- for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
- UTIL_THROW_IF2(factor == NULL,
- "Empty factor " << i << " at position " << pos);
-
- out << "|" << *factor;
- }
- out << " ";
- }
- }
-}
-
-void OutputSurface(std::ostream &out, const ChartHypothesis *hypo, const std::vector<FactorType> &outputFactorOrder
- ,bool reportSegmentation, bool reportAllFactors)
-{
- if ( hypo != NULL) {
- //OutputSurface(out, hypo->GetCurrTargetPhrase(), outputFactorOrder, reportAllFactors);
-
- const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
-
- vector<const ChartHypothesis*>::const_iterator iter;
- for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
- const ChartHypothesis *prevHypo = *iter;
-
- OutputSurface(out, prevHypo, outputFactorOrder, reportSegmentation, reportAllFactors);
- }
- }
-}
-
-void IOWrapper::Backtrack(const ChartHypothesis *hypo)
-{
- const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
-
- vector<const ChartHypothesis*>::const_iterator iter;
- for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
- const ChartHypothesis *prevHypo = *iter;
-
- VERBOSE(3,prevHypo->GetId() << " <= ");
- Backtrack(prevHypo);
- }
-}
-
-void IOWrapper::OutputBestHypo(const std::vector<const Factor*>& mbrBestHypo, long /*translationId*/)
-{
- for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
- const Factor *factor = mbrBestHypo[i];
- UTIL_THROW_IF(factor == NULL, util::Exception,
- "No factor at position " << i );
-
- cout << *factor << " ";
- }
-}
-/*
-void OutputInput(std::vector<const Phrase*>& map, const ChartHypothesis* hypo)
-{
- if (hypo->GetPrevHypos())
- {
- OutputInput(map, hypo->GetPrevHypos());
- map[hypo->GetCurrSourceWordsRange().GetStartPos()] = hypo->GetSourcePhrase();
- }
-}
-
-void OutputInput(std::ostream& os, const ChartHypothesis* hypo)
-{
- size_t len = StaticData::Instance().GetInput()->GetSize();
- std::vector<const Phrase*> inp_phrases(len, 0);
- OutputInput(inp_phrases, hypo);
- for (size_t i=0; i<len; ++i)
- if (inp_phrases[i]) os << *inp_phrases[i];
-}
-*/
-
-// Given a hypothesis and sentence, reconstructs the 'application context' --
-// the source RHS symbols of the SCFG rule that was applied, plus their spans.
-void IOWrapper::ReconstructApplicationContext(const ChartHypothesis &hypo,
- const Sentence &sentence,
- ApplicationContext &context)
-{
- context.clear();
- const std::vector<const ChartHypothesis*> &prevHypos = hypo.GetPrevHypos();
- std::vector<const ChartHypothesis*>::const_iterator p = prevHypos.begin();
- std::vector<const ChartHypothesis*>::const_iterator end = prevHypos.end();
- const WordsRange &span = hypo.GetCurrSourceRange();
- size_t i = span.GetStartPos();
- while (i <= span.GetEndPos()) {
- if (p == end || i < (*p)->GetCurrSourceRange().GetStartPos()) {
- // Symbol is a terminal.
- const Word &symbol = sentence.GetWord(i);
- context.push_back(std::make_pair(symbol, WordsRange(i, i)));
- ++i;
- } else {
- // Symbol is a non-terminal.
- const Word &symbol = (*p)->GetTargetLHS();
- const WordsRange &range = (*p)->GetCurrSourceRange();
- context.push_back(std::make_pair(symbol, range));
- i = range.GetEndPos()+1;
- ++p;
- }
- }
-}
-
-
-// Given a hypothesis and sentence, reconstructs the 'application context' --
-// the source RHS symbols of the SCFG rule that was applied, plus their spans.
-void IOWrapper::ReconstructApplicationContext(const search::Applied *applied,
- const Sentence &sentence,
- ApplicationContext &context)
-{
- context.clear();
- const WordsRange &span = applied->GetRange();
- const search::Applied *child = applied->Children();
- size_t i = span.GetStartPos();
- size_t j = 0;
-
- while (i <= span.GetEndPos()) {
- if (j == applied->GetArity() || i < child->GetRange().GetStartPos()) {
- // Symbol is a terminal.
- const Word &symbol = sentence.GetWord(i);
- context.push_back(std::make_pair(symbol, WordsRange(i, i)));
- ++i;
- } else {
- // Symbol is a non-terminal.
- const Word &symbol = static_cast<const TargetPhrase*>(child->GetNote().vp)->GetTargetLHS();
- const WordsRange &range = child->GetRange();
- context.push_back(std::make_pair(symbol, range));
- i = range.GetEndPos()+1;
- ++child;
- ++j;
- }
- }
-}
-
-
-// Emulates the old operator<<(ostream &, const DottedRule &) function. The
-// output format is a bit odd (reverse order and double spacing between symbols)
-// but there are scripts and tools that expect the output of -T to look like
-// that.
-void IOWrapper::WriteApplicationContext(std::ostream &out,
- const ApplicationContext &context)
-{
- assert(!context.empty());
- ApplicationContext::const_reverse_iterator p = context.rbegin();
- while (true) {
- out << p->second << "=" << p->first << " ";
- if (++p == context.rend()) {
- break;
- }
- out << " ";
- }
-}
-
-void IOWrapper::OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
-{
- ReconstructApplicationContext(*hypo, sentence, applicationContext);
- out << "Trans Opt " << translationId
- << " " << hypo->GetCurrSourceRange()
- << ": ";
- WriteApplicationContext(out, applicationContext);
- out << ": " << hypo->GetCurrTargetPhrase().GetTargetLHS()
- << "->" << hypo->GetCurrTargetPhrase()
- << " " << hypo->GetTotalScore() << hypo->GetScoreBreakdown();
-}
-
-void IOWrapper::OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
-{
- ReconstructApplicationContext(applied, sentence, applicationContext);
- const TargetPhrase &phrase = *static_cast<const TargetPhrase*>(applied->GetNote().vp);
- out << "Trans Opt " << translationId
- << " " << applied->GetRange()
- << ": ";
- WriteApplicationContext(out, applicationContext);
- out << ": " << phrase.GetTargetLHS()
- << "->" << phrase
- << " " << applied->GetScore(); // << hypo->GetScoreBreakdown() TODO: missing in incremental search hypothesis
-}
-
-
-void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
-{
- if (hypo != NULL) {
- OutputTranslationOption(out, applicationContext, hypo, sentence, translationId);
- out << std::endl;
- }
-
- // recursive
- const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
- std::vector<const ChartHypothesis*>::const_iterator iter;
- for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
- const ChartHypothesis *prevHypo = *iter;
- OutputTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
- }
-}
-
-
-void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
-{
- if (applied != NULL) {
- OutputTranslationOption(out, applicationContext, applied, sentence, translationId);
- out << std::endl;
- }
-
- // recursive
- const search::Applied *child = applied->Children();
- for (size_t i = 0; i < applied->GetArity(); i++) {
- OutputTranslationOptions(out, applicationContext, child++, sentence, translationId);
- }
-}
-
-void IOWrapper::OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
-{
-
- if (hypo != NULL) {
- OutputTranslationOption(out, applicationContext, hypo, sentence, translationId);
-
- const TargetPhrase &currTarPhr = hypo->GetCurrTargetPhrase();
- boost::shared_ptr<PhraseProperty> property;
-
- out << " ||| ";
- if (currTarPhr.GetProperty("Tree", property)) {
- out << " " << property->GetValueString();
- } else {
- out << " " << "noTreeInfo";
- }
- out << std::endl;
- }
-
- // recursive
- const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
- std::vector<const ChartHypothesis*>::const_iterator iter;
- for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
- const ChartHypothesis *prevHypo = *iter;
- OutputTreeFragmentsTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
- }
-}
-
-void IOWrapper::OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
-{
-
- if (applied != NULL) {
- OutputTranslationOption(out, applicationContext, applied, sentence, translationId);
-
- const TargetPhrase &currTarPhr = *static_cast<const TargetPhrase*>(applied->GetNote().vp);
- boost::shared_ptr<PhraseProperty> property;
-
- out << " ||| ";
- if (currTarPhr.GetProperty("Tree", property)) {
- out << " " << property->GetValueString();
- } else {
- out << " " << "noTreeInfo";
- }
- out << std::endl;
- }
-
- // recursive
- const search::Applied *child = applied->Children();
- for (size_t i = 0; i < applied->GetArity(); i++) {
- OutputTreeFragmentsTranslationOptions(out, applicationContext, child++, sentence, translationId);
- }
-}
-
-void IOWrapper::OutputDetailedTranslationReport(
- const ChartHypothesis *hypo,
- const Sentence &sentence,
- long translationId)
-{
- if (hypo == NULL) {
- return;
- }
- std::ostringstream out;
- ApplicationContext applicationContext;
-
- OutputTranslationOptions(out, applicationContext, hypo, sentence, translationId);
- UTIL_THROW_IF2(m_detailOutputCollector == NULL,
- "No ouput file for detailed reports specified");
- m_detailOutputCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::OutputDetailedTranslationReport(
- const search::Applied *applied,
- const Sentence &sentence,
- long translationId)
-{
- if (applied == NULL) {
- return;
- }
- std::ostringstream out;
- ApplicationContext applicationContext;
-
- OutputTranslationOptions(out, applicationContext, applied, sentence, translationId);
- UTIL_THROW_IF2(m_detailOutputCollector == NULL,
- "No ouput file for detailed reports specified");
- m_detailOutputCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
- const ChartHypothesis *hypo,
- const Sentence &sentence,
- long translationId)
-{
- if (hypo == NULL) {
- return;
- }
- std::ostringstream out;
- ApplicationContext applicationContext;
-
- OutputTreeFragmentsTranslationOptions(out, applicationContext, hypo, sentence, translationId);
- UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
- "No output file for tree fragments specified");
-
- //Tree of full sentence
- const StatefulFeatureFunction* treeStructure = StaticData::Instance().GetTreeStructure();
- if (treeStructure != NULL) {
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for( size_t i=0; i<sff.size(); i++ ) {
- if (sff[i] == treeStructure) {
- const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
- out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
- break;
- }
- }
- }
-
- m_detailTreeFragmentsOutputCollector->Write(translationId, out.str());
-
-}
-
-void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
- const search::Applied *applied,
- const Sentence &sentence,
- long translationId)
-{
- if (applied == NULL) {
- return;
- }
- std::ostringstream out;
- ApplicationContext applicationContext;
-
- OutputTreeFragmentsTranslationOptions(out, applicationContext, applied, sentence, translationId);
- UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
- "No output file for tree fragments specified");
-
- //Tree of full sentence
- //TODO: incremental search doesn't support stateful features
-
- m_detailTreeFragmentsOutputCollector->Write(translationId, out.str());
-
-}
-
-//DIMw
-void IOWrapper::OutputDetailedAllTranslationReport(
- const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList,
- const ChartManager &manager,
- const Sentence &sentence,
- long translationId)
-{
- std::ostringstream out;
- ApplicationContext applicationContext;
-
- const ChartCellCollection& cells = manager.GetChartCellCollection();
- size_t size = manager.GetSource().GetSize();
- for (size_t width = 1; width <= size; ++width) {
- for (size_t startPos = 0; startPos <= size-width; ++startPos) {
- size_t endPos = startPos + width - 1;
- WordsRange range(startPos, endPos);
- const ChartCell& cell = cells.Get(range);
- const HypoList* hyps = cell.GetAllSortedHypotheses();
- out << "Chart Cell [" << startPos << ".." << endPos << "]" << endl;
- HypoList::const_iterator iter;
- size_t c = 1;
- for (iter = hyps->begin(); iter != hyps->end(); ++iter) {
- out << "----------------Item " << c++ << " ---------------------"
- << endl;
- OutputTranslationOptions(out, applicationContext, *iter,
- sentence, translationId);
- }
- }
- }
- UTIL_THROW_IF2(m_detailAllOutputCollector == NULL,
- "No output file for details specified");
- m_detailAllOutputCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)
-{
- if (!m_singleBestOutputCollector)
- return;
- std::ostringstream out;
- IOWrapper::FixPrecision(out);
- if (hypo != NULL) {
- VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
- VERBOSE(3,"Best path: ");
- Backtrack(hypo);
- VERBOSE(3,"0" << std::endl);
-
- if (StaticData::Instance().GetOutputHypoScore()) {
- out << hypo->GetTotalScore() << " ";
- }
-
- if (StaticData::Instance().IsPathRecoveryEnabled()) {
- out << "||| ";
- }
- Phrase outPhrase(ARRAY_SIZE_INCR);
- hypo->GetOutputPhrase(outPhrase);
-
- // delete 1st & last
- UTIL_THROW_IF2(outPhrase.GetSize() < 2,
- "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
-
- outPhrase.RemoveWord(0);
- outPhrase.RemoveWord(outPhrase.GetSize() - 1);
-
- const std::vector<FactorType> outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
- string output = outPhrase.GetStringRep(outputFactorOrder);
- out << output << endl;
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
-
- if (StaticData::Instance().GetOutputHypoScore()) {
- out << "0 ";
- }
-
- out << endl;
- }
- m_singleBestOutputCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::OutputBestHypo(search::Applied applied, long translationId)
-{
- if (!m_singleBestOutputCollector) return;
- std::ostringstream out;
- IOWrapper::FixPrecision(out);
- if (StaticData::Instance().GetOutputHypoScore()) {
- out << applied.GetScore() << ' ';
- }
- Phrase outPhrase;
- Incremental::ToPhrase(applied, outPhrase);
- // delete 1st & last
- UTIL_THROW_IF2(outPhrase.GetSize() < 2,
- "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
- outPhrase.RemoveWord(0);
- outPhrase.RemoveWord(outPhrase.GetSize() - 1);
- out << outPhrase.GetStringRep(StaticData::Instance().GetOutputFactorOrder());
- out << '\n';
- m_singleBestOutputCollector->Write(translationId, out.str());
-
- VERBOSE(1,"BEST TRANSLATION: " << outPhrase << "[total=" << applied.GetScore() << "]" << endl);
-}
-
-void IOWrapper::OutputBestNone(long translationId)
-{
- if (!m_singleBestOutputCollector) return;
- if (StaticData::Instance().GetOutputHypoScore()) {
- m_singleBestOutputCollector->Write(translationId, "0 \n");
- } else {
- m_singleBestOutputCollector->Write(translationId, "\n");
- }
-}
-
-void IOWrapper::OutputAllFeatureScores(const ScoreComponentCollection &features, std::ostream &out)
-{
- std::string lastName = "";
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for( size_t i=0; i<sff.size(); i++ ) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
- && ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- for( size_t i=0; i<slf.size(); i++ ) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
-} // namespace
-
-void IOWrapper::OutputFeatureScores( std::ostream& out, const ScoreComponentCollection &features, const FeatureFunction *ff, std::string &lastName )
-{
- const StaticData &staticData = StaticData::Instance();
- bool labeledOutput = staticData.IsLabeledNBestList();
-
- // regular features (not sparse)
- if (ff->GetNumScoreComponents() != 0) {
- if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
- lastName = ff->GetScoreProducerDescription();
- out << " " << lastName << "=";
- }
- vector<float> scores = features.GetScoresForProducer( ff );
- for (size_t j = 0; j<scores.size(); ++j) {
- out << " " << scores[j];
- }
- }
-
- // sparse features
- const FVector scores = features.GetVectorForProducer( ff );
- for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
- out << " " << i->first << "= " << i->second;
- }
-}
-
-void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList,
- long translationId)
-{
- std::ostringstream out;
-
- if (m_nBestOutputCollector->OutputIsCout()) {
- // Set precision only if we're writing the n-best list to cout. This is to
- // preserve existing behaviour, but should probably be done either way.
- IOWrapper::FixPrecision(out);
- }
-
- bool includeWordAlignment =
- StaticData::Instance().PrintAlignmentInfoInNbest();
-
- for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
- p != nBestList.end(); ++p) {
- const ChartKBestExtractor::Derivation &derivation = **p;
-
- // get the derivation's target-side yield
- Phrase outputPhrase = ChartKBestExtractor::GetOutputPhrase(derivation);
-
- // delete <s> and </s>
- UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
- "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
- outputPhrase.RemoveWord(0);
- outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
-
- // print the translation ID, surface factors, and scores
- out << translationId << " ||| ";
- OutputSurface(out, outputPhrase, m_outputFactorOrder, false);
- out << " ||| ";
- OutputAllFeatureScores(derivation.scoreBreakdown, out);
- out << " ||| " << derivation.score;
-
- // optionally, print word alignments
- if (includeWordAlignment) {
- out << " ||| ";
- Alignments align;
- OutputAlignmentNBest(align, derivation, 0);
- for (Alignments::const_iterator q = align.begin(); q != align.end();
- ++q) {
- out << q->first << "-" << q->second << " ";
- }
- }
-
- out << std::endl;
- }
-
- assert(m_nBestOutputCollector);
- m_nBestOutputCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::OutputNBestList(const std::vector<search::Applied> &nbest, long translationId)
-{
- std::ostringstream out;
- // wtf? copied from the original OutputNBestList
- if (m_nBestOutputCollector->OutputIsCout()) {
- IOWrapper::FixPrecision(out);
- }
- Phrase outputPhrase;
- ScoreComponentCollection features;
- for (std::vector<search::Applied>::const_iterator i = nbest.begin(); i != nbest.end(); ++i) {
- Incremental::PhraseAndFeatures(*i, outputPhrase, features);
- // <s> and </s>
- UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
- "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
-
- outputPhrase.RemoveWord(0);
- outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
- out << translationId << " ||| ";
- OutputSurface(out, outputPhrase, m_outputFactorOrder, false);
- out << " ||| ";
- OutputAllFeatureScores(features, out);
- out << " ||| " << i->GetScore() << '\n';
- }
- out << std::flush;
- assert(m_nBestOutputCollector);
- m_nBestOutputCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
-{
- stream.setf(std::ios::fixed);
- stream.precision(size);
-}
-
-template <class T>
-void ShiftOffsets(vector<T> &offsets, T shift)
-{
- T currPos = shift;
- for (size_t i = 0; i < offsets.size(); ++i) {
- if (offsets[i] == 0) {
- offsets[i] = currPos;
- ++currPos;
- } else {
- currPos += offsets[i];
- }
- }
-}
-
-size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
-{
- size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
- const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
- for (size_t i = 0; i < prevHypos.size(); ++i) {
- size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
- ret -= (childSize - 1);
- }
- return ret;
-}
-
-size_t IOWrapper::OutputAlignmentNBest(
- Alignments &retAlign,
- const Moses::ChartKBestExtractor::Derivation &derivation,
- size_t startTarget)
-{
- const ChartHypothesis &hypo = derivation.edge.head->hypothesis;
-
- size_t totalTargetSize = 0;
- size_t startSource = hypo.GetCurrSourceRange().GetStartPos();
-
- const TargetPhrase &tp = hypo.GetCurrTargetPhrase();
-
- size_t thisSourceSize = CalcSourceSize(&hypo);
-
- // position of each terminal word in translation rule, irrespective of alignment
- // if non-term, number is undefined
- vector<size_t> sourceOffsets(thisSourceSize, 0);
- vector<size_t> targetOffsets(tp.GetSize(), 0);
-
- const AlignmentInfo &aiNonTerm = hypo.GetCurrTargetPhrase().GetAlignNonTerm();
- vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
- const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
-
- UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
- "Error");
-
- size_t targetInd = 0;
- for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
- if (tp.GetWord(targetPos).IsNonTerminal()) {
- UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
- size_t sourceInd = targetPos2SourceInd[targetPos];
- size_t sourcePos = sourceInd2pos[sourceInd];
-
- const Moses::ChartKBestExtractor::Derivation &subderivation =
- *derivation.subderivations[sourceInd];
-
- // calc source size
- size_t sourceSize = subderivation.edge.head->hypothesis.GetCurrSourceRange().GetNumWordsCovered();
- sourceOffsets[sourcePos] = sourceSize;
-
- // calc target size.
- // Recursively look thru child hypos
- size_t currStartTarget = startTarget + totalTargetSize;
- size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
- currStartTarget);
- targetOffsets[targetPos] = targetSize;
-
- totalTargetSize += targetSize;
- ++targetInd;
- } else {
- ++totalTargetSize;
- }
- }
-
- // convert position within translation rule to absolute position within
- // source sentence / output sentence
- ShiftOffsets(sourceOffsets, startSource);
- ShiftOffsets(targetOffsets, startTarget);
-
- // get alignments from this hypo
- const AlignmentInfo &aiTerm = hypo.GetCurrTargetPhrase().GetAlignTerm();
-
- // add to output arg, offsetting by source & target
- AlignmentInfo::const_iterator iter;
- for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
- const std::pair<size_t,size_t> &align = *iter;
- size_t relSource = align.first;
- size_t relTarget = align.second;
- size_t absSource = sourceOffsets[relSource];
- size_t absTarget = targetOffsets[relTarget];
-
- pair<size_t, size_t> alignPoint(absSource, absTarget);
- pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
- UTIL_THROW_IF2(!ret.second, "Error");
- }
-
- return totalTargetSize;
-}
-
-void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothesis *hypo)
-{
- ostringstream out;
-
- if (hypo) {
- Alignments retAlign;
- OutputAlignment(retAlign, hypo, 0);
-
- // output alignments
- Alignments::const_iterator iter;
- for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
- const pair<size_t, size_t> &alignPoint = *iter;
- out << alignPoint.first << "-" << alignPoint.second << " ";
- }
- }
- out << endl;
-
- m_alignmentInfoCollector->Write(translationId, out.str());
-}
-
-void IOWrapper::OutputUnknowns(const std::vector<Moses::Phrase*> &unknowns,
- long translationId)
-{
- std::ostringstream out;
- for (std::size_t i = 0; i < unknowns.size(); ++i) {
- out << *(unknowns[i]);
- }
- out << std::endl;
- m_unknownsCollector->Write(translationId, out.str());
-}
-
-size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget)
-{
- size_t totalTargetSize = 0;
- size_t startSource = hypo->GetCurrSourceRange().GetStartPos();
-
- const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
-
- size_t thisSourceSize = CalcSourceSize(hypo);
-
- // position of each terminal word in translation rule, irrespective of alignment
- // if non-term, number is undefined
- vector<size_t> sourceOffsets(thisSourceSize, 0);
- vector<size_t> targetOffsets(tp.GetSize(), 0);
-
- const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
-
- const AlignmentInfo &aiNonTerm = hypo->GetCurrTargetPhrase().GetAlignNonTerm();
- vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
- const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
-
- UTIL_THROW_IF2(sourceInd2pos.size() != prevHypos.size(), "Error");
-
- size_t targetInd = 0;
- for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
- if (tp.GetWord(targetPos).IsNonTerminal()) {
- UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
- size_t sourceInd = targetPos2SourceInd[targetPos];
- size_t sourcePos = sourceInd2pos[sourceInd];
-
- const ChartHypothesis *prevHypo = prevHypos[sourceInd];
-
- // calc source size
- size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
- sourceOffsets[sourcePos] = sourceSize;
-
- // calc target size.
- // Recursively look thru child hypos
- size_t currStartTarget = startTarget + totalTargetSize;
- size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
- targetOffsets[targetPos] = targetSize;
-
- totalTargetSize += targetSize;
- ++targetInd;
- } else {
- ++totalTargetSize;
- }
- }
-
- // convert position within translation rule to absolute position within
- // source sentence / output sentence
- ShiftOffsets(sourceOffsets, startSource);
- ShiftOffsets(targetOffsets, startTarget);
-
- // get alignments from this hypo
- const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
-
- // add to output arg, offsetting by source & target
- AlignmentInfo::const_iterator iter;
- for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
- const std::pair<size_t,size_t> &align = *iter;
- size_t relSource = align.first;
- size_t relTarget = align.second;
- size_t absSource = sourceOffsets[relSource];
- size_t absTarget = targetOffsets[relTarget];
-
- pair<size_t, size_t> alignPoint(absSource, absTarget);
- pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
- UTIL_THROW_IF2(!ret.second, "Error");
-
- }
-
- return totalTargetSize;
-}
-
-void IOWrapper::OutputAlignment(vector< set<size_t> > &retAlignmentsS2T, const AlignmentInfo &ai)
-{
- typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
- AlignVec alignments = ai.GetSortedAlignments();
-
- AlignVec::const_iterator it;
- for (it = alignments.begin(); it != alignments.end(); ++it) {
- const std::pair<size_t,size_t> &alignPoint = **it;
-
- UTIL_THROW_IF2(alignPoint.first >= retAlignmentsS2T.size(), "Error");
- pair<set<size_t>::iterator, bool> ret = retAlignmentsS2T[alignPoint.first].insert(alignPoint.second);
- UTIL_THROW_IF2(!ret.second, "Error");
- }
-}
-
-}
-
diff --git a/moses-chart-cmd/IOWrapper.h b/moses-chart-cmd/IOWrapper.h
deleted file mode 100644
index b54ee224d..000000000
--- a/moses-chart-cmd/IOWrapper.h
+++ /dev/null
@@ -1,150 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file on how to use moses library
-
-#pragma once
-
-#include <fstream>
-#include <vector>
-#include <set>
-#include "moses/TypeDef.h"
-#include "moses/Sentence.h"
-#include "moses/FactorTypeSet.h"
-#include "moses/ChartKBestExtractor.h"
-#include "moses/OutputCollector.h"
-#include "moses/ChartHypothesis.h"
-#include "search/applied.hh"
-#include "moses/ChartManager.h"
-
-namespace Moses
-{
-class FactorCollection;
-class ScoreComponentCollection;
-}
-
-namespace MosesChartCmd
-{
-
-/** Helper class that holds misc variables to write data out to command line.
- */
-class IOWrapper
-{
-protected:
- typedef std::vector<std::pair<Moses::Word, Moses::WordsRange> > ApplicationContext;
-
- long m_translationId;
-
- const std::vector<Moses::FactorType> &m_inputFactorOrder;
- const std::vector<Moses::FactorType> &m_outputFactorOrder;
- const Moses::FactorMask &m_inputFactorUsed;
- std::ostream *m_outputSearchGraphStream;
- std::ostream *m_detailedTranslationReportingStream;
- std::ostream *m_detailedTreeFragmentsTranslationReportingStream;
- //DIMw
- std::ostream *m_detailedAllTranslationReportingStream;
- std::ostream *m_alignmentInfoStream;
- std::ostream *m_unknownsStream;
- std::string m_inputFilePath;
- std::istream *m_inputStream;
- Moses::OutputCollector *m_detailOutputCollector;
- Moses::OutputCollector *m_detailTreeFragmentsOutputCollector;
- //DIMw
- Moses::OutputCollector *m_detailAllOutputCollector;
- Moses::OutputCollector *m_nBestOutputCollector;
- Moses::OutputCollector *m_searchGraphOutputCollector;
- Moses::OutputCollector *m_singleBestOutputCollector;
- Moses::OutputCollector *m_alignmentInfoCollector;
- Moses::OutputCollector *m_unknownsCollector;
-
- typedef std::set< std::pair<size_t, size_t> > Alignments;
- std::size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartKBestExtractor::Derivation &derivation, std::size_t startTarget);
- size_t OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget);
- void OutputAlignment(std::vector< std::set<size_t> > &retAlignmentsS2T, const Moses::AlignmentInfo &ai);
- void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void ReconstructApplicationContext(const Moses::ChartHypothesis &hypo,
- const Moses::Sentence &sentence,
- ApplicationContext &context);
- void ReconstructApplicationContext(const search::Applied *applied,
- const Moses::Sentence &sentence,
- ApplicationContext &context);
- void WriteApplicationContext(std::ostream &out,
- const ApplicationContext &context);
-
- void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out);
- void OutputFeatureScores( std::ostream& out
- , const Moses::ScoreComponentCollection &features
- , const Moses::FeatureFunction *ff
- , std::string &lastName );
-
-public:
- IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &inputFilePath="");
- ~IOWrapper();
-
- Moses::InputType* GetInput(Moses::InputType *inputType);
- void OutputBestHypo(const Moses::ChartHypothesis *hypo, long translationId);
- void OutputBestHypo(search::Applied applied, long translationId);
- void OutputBestHypo(const std::vector<const Moses::Factor*>& mbrBestHypo, long translationId);
- void OutputBestNone(long translationId);
- void OutputNBestList(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, long translationId);
- void OutputNBestList(const std::vector<search::Applied> &nbest, long translationId);
- void OutputDetailedTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputDetailedTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void OutputDetailedTreeFragmentsTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
- void OutputDetailedTreeFragmentsTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
- void OutputDetailedAllTranslationReport(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, const Moses::ChartManager &manager, const Moses::Sentence &sentence, long translationId);
- void Backtrack(const Moses::ChartHypothesis *hypo);
-
- void ResetTranslationId();
-
- Moses::OutputCollector *GetSearchGraphOutputCollector() {
- return m_searchGraphOutputCollector;
- }
-
- void OutputAlignment(size_t translationId , const Moses::ChartHypothesis *hypo);
- void OutputUnknowns(const std::vector<Moses::Phrase*> &, long);
-
- static void FixPrecision(std::ostream &, size_t size=3);
-};
-
-}
diff --git a/moses-chart-cmd/Jamfile b/moses-chart-cmd/Jamfile
deleted file mode 100644
index ba107fa67..000000000
--- a/moses-chart-cmd/Jamfile
+++ /dev/null
@@ -1,2 +0,0 @@
-exe moses_chart : Main.cpp mbr.cpp IOWrapper.cpp TranslationAnalysis.cpp ../moses//moses $(TOP)//boost_iostreams ;
-
diff --git a/moses-chart-cmd/Main.cpp b/moses-chart-cmd/Main.cpp
deleted file mode 100644
index 8773f01f6..000000000
--- a/moses-chart-cmd/Main.cpp
+++ /dev/null
@@ -1,365 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file on how to use moses library
-
-#ifdef WIN32
-// Include Visual Leak Detector
-//#include <vld.h>
-#endif
-
-#include <exception>
-#include <fstream>
-#include "Main.h"
-#include "TranslationAnalysis.h"
-#include "mbr.h"
-#include "IOWrapper.h"
-
-#include "moses/FactorCollection.h"
-#include "moses/Manager.h"
-#include "moses/Phrase.h"
-#include "moses/Util.h"
-#include "moses/Timer.h"
-#include "moses/Sentence.h"
-#include "moses/ConfusionNet.h"
-#include "moses/WordLattice.h"
-#include "moses/TreeInput.h"
-#include "moses/ThreadPool.h"
-#include "moses/ChartManager.h"
-#include "moses/ChartHypothesis.h"
-#include "moses/Incremental.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-
-#include "util/usage.hh"
-#include "util/exception.hh"
-
-using namespace std;
-using namespace Moses;
-using namespace MosesChartCmd;
-
-/** Enforce rounding */
-void fix(std::ostream& stream, size_t size)
-{
- stream.setf(std::ios::fixed);
- stream.precision(size);
-}
-
-/**
- * Translates a sentence.
- **/
-class TranslationTask : public Task
-{
-public:
- TranslationTask(InputType *source, IOWrapper &ioWrapper)
- : m_source(source)
- , m_ioWrapper(ioWrapper) {
- }
-
- ~TranslationTask() {
- delete m_source;
- }
-
- void Run() {
- const StaticData &staticData = StaticData::Instance();
- const size_t translationId = m_source->GetTranslationId();
-
- VERBOSE(2,"\nTRANSLATING(" << translationId << "): " << *m_source);
-
- if (staticData.GetSearchAlgorithm() == ChartIncremental) {
- Incremental::Manager manager(*m_source);
- const std::vector<search::Applied> &nbest = manager.ProcessSentence();
- if (!nbest.empty()) {
- m_ioWrapper.OutputBestHypo(nbest[0], translationId);
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- m_ioWrapper.OutputDetailedTranslationReport(&nbest[0], sentence, translationId);
- }
- if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- m_ioWrapper.OutputDetailedTreeFragmentsTranslationReport(&nbest[0], sentence, translationId);
- }
- } else {
- m_ioWrapper.OutputBestNone(translationId);
- }
- if (staticData.GetNBestSize() > 0)
- m_ioWrapper.OutputNBestList(nbest, translationId);
- return;
- }
-
- ChartManager manager(*m_source);
- manager.ProcessSentence();
-
- UTIL_THROW_IF2(staticData.UseMBR(), "Cannot use MBR");
-
- // 1-best
- const ChartHypothesis *bestHypo = manager.GetBestHypothesis();
- m_ioWrapper.OutputBestHypo(bestHypo, translationId);
- IFVERBOSE(2) {
- PrintUserTime("Best Hypothesis Generation Time:");
- }
-
- if (!staticData.GetAlignmentOutputFile().empty()) {
- m_ioWrapper.OutputAlignment(translationId, bestHypo);
- }
-
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- m_ioWrapper.OutputDetailedTranslationReport(bestHypo, sentence, translationId);
- }
- if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- m_ioWrapper.OutputDetailedTreeFragmentsTranslationReport(bestHypo, sentence, translationId);
- }
- if (!staticData.GetOutputUnknownsFile().empty()) {
- m_ioWrapper.OutputUnknowns(manager.GetParser().GetUnknownSources(),
- translationId);
- }
-
- //DIMw
- if (staticData.IsDetailedAllTranslationReportingEnabled()) {
- const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
- size_t nBestSize = staticData.GetNBestSize();
- std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
- manager.CalcNBest(nBestSize, nBestList, staticData.GetDistinctNBest());
- m_ioWrapper.OutputDetailedAllTranslationReport(nBestList, manager, sentence, translationId);
- }
-
- // n-best
- size_t nBestSize = staticData.GetNBestSize();
- if (nBestSize > 0) {
- VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl);
- std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
- manager.CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest());
- m_ioWrapper.OutputNBestList(nBestList, translationId);
- IFVERBOSE(2) {
- PrintUserTime("N-Best Hypotheses Generation Time:");
- }
- }
-
- if (staticData.GetOutputSearchGraph()) {
- std::ostringstream out;
- manager.GetSearchGraph(translationId, out);
- OutputCollector *oc = m_ioWrapper.GetSearchGraphOutputCollector();
- UTIL_THROW_IF2(oc == NULL, "File for search graph output not specified");
- oc->Write(translationId, out.str());
- }
-
- IFVERBOSE(2) {
- PrintUserTime("Sentence Decoding Time:");
- }
- manager.CalcDecoderStatistics();
- }
-
-private:
- // Non-copyable: copy constructor and assignment operator not implemented.
- TranslationTask(const TranslationTask &);
- TranslationTask &operator=(const TranslationTask &);
-
- InputType *m_source;
- IOWrapper &m_ioWrapper;
-};
-
-bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
-{
- delete source;
- switch(inputType) {
- case SentenceInput:
- source = ioWrapper.GetInput(new Sentence);
- break;
- case ConfusionNetworkInput:
- source = ioWrapper.GetInput(new ConfusionNet);
- break;
- case WordLatticeInput:
- source = ioWrapper.GetInput(new WordLattice);
- break;
- case TreeInputType:
- source = ioWrapper.GetInput(new TreeInput);
- break;
- default:
- TRACE_ERR("Unknown input type: " << inputType << "\n");
- }
- return (source ? true : false);
-}
-static void PrintFeatureWeight(const FeatureFunction* ff)
-{
- cout << ff->GetScoreProducerDescription() << "=";
- size_t numScoreComps = ff->GetNumScoreComponents();
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- for (size_t i = 0; i < numScoreComps; ++i) {
- cout << " " << values[i];
- }
- cout << endl;
-
-}
-
-static void ShowWeights()
-{
- fix(cout,6);
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
-
- for (size_t i = 0; i < sff.size(); ++i) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- } else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- } else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
-}
-
-
-int main(int argc, char* argv[])
-{
- try {
- IFVERBOSE(1) {
- TRACE_ERR("command: ");
- for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
- TRACE_ERR(endl);
- }
-
- IOWrapper::FixPrecision(cout);
- IOWrapper::FixPrecision(cerr);
-
- // load data structures
- Parameter parameter;
- if (!parameter.LoadParam(argc, argv)) {
- return EXIT_FAILURE;
- }
-
- const StaticData &staticData = StaticData::Instance();
- if (!StaticData::LoadDataStatic(&parameter, argv[0]))
- return EXIT_FAILURE;
-
- if (parameter.isParamSpecified("show-weights")) {
- ShowWeights();
- exit(0);
- }
-
- UTIL_THROW_IF2(!staticData.IsChart(), "Must be SCFG model");
-
- // set up read/writing class
- IOWrapper *ioWrapper = GetIOWrapper(staticData);
-
- // check on weights
- const ScoreComponentCollection& weights = staticData.GetAllWeights();
- IFVERBOSE(2) {
- TRACE_ERR("The global weight vector looks like this: ");
- TRACE_ERR(weights);
- TRACE_ERR("\n");
- }
-
- if (ioWrapper == NULL)
- return EXIT_FAILURE;
-
-#ifdef WITH_THREADS
- ThreadPool pool(staticData.ThreadCount());
-#endif
-
- // read each sentence & decode
- InputType *source=0;
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
- IFVERBOSE(1)
- ResetUserTime();
- TranslationTask *task = new TranslationTask(source, *ioWrapper);
- source = NULL; // task will delete source
-#ifdef WITH_THREADS
- pool.Submit(task); // pool will delete task
-#else
- task->Run();
- delete task;
-#endif
- }
-
-#ifdef WITH_THREADS
- pool.Stop(true); // flush remaining jobs
-#endif
-
- delete ioWrapper;
- FeatureFunction::Destroy();
-
- IFVERBOSE(1)
- PrintUserTime("End.");
-
- } catch (const std::exception &e) {
- std::cerr << "Exception: " << e.what() << std::endl;
- return EXIT_FAILURE;
- }
-
- IFVERBOSE(1) util::PrintUsage(std::cerr);
-
-#ifndef EXIT_RETURN
- //This avoids that detructors are called (it can take a long time)
- exit(EXIT_SUCCESS);
-#else
- return EXIT_SUCCESS;
-#endif
-}
-
-IOWrapper *GetIOWrapper(const StaticData &staticData)
-{
- IOWrapper *ioWrapper;
- const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
- ,&outputFactorOrder = staticData.GetOutputFactorOrder();
- FactorMask inputFactorUsed(inputFactorOrder);
-
- // io
- if (staticData.GetParam("input-file").size() == 1) {
- VERBOSE(2,"IO from File" << endl);
- string filePath = staticData.GetParam("input-file")[0];
-
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath()
- , filePath);
- } else {
- VERBOSE(1,"IO from STDOUT/STDIN" << endl);
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath());
- }
- ioWrapper->ResetTranslationId();
-
- IFVERBOSE(1)
- PrintUserTime("Created input-output object");
-
- return ioWrapper;
-}
diff --git a/moses-chart-cmd/Main.h b/moses-chart-cmd/Main.h
deleted file mode 100644
index 319e3889c..000000000
--- a/moses-chart-cmd/Main.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file on how to use moses library
-
-#pragma once
-
-#include "moses/StaticData.h"
-
-namespace MosesChartCmd
-{
-class IOWrapper;
-}
-
-int main(int argc, char* argv[]);
-MosesChartCmd::IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
diff --git a/moses-chart-cmd/TranslationAnalysis.cpp b/moses-chart-cmd/TranslationAnalysis.cpp
deleted file mode 100644
index 964050fad..000000000
--- a/moses-chart-cmd/TranslationAnalysis.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-// $Id$
-
-#include <iostream>
-#include <sstream>
-#include <algorithm>
-
-#include "TranslationAnalysis.h"
-
-#include "moses/StaticData.h"
-#include "moses/TranslationOption.h"
-#include "moses/DecodeStepTranslation.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-#include "moses/LM/Base.h"
-
-using namespace std;
-using namespace Moses;
-
-namespace TranslationAnalysis
-{
-
-void PrintTranslationAnalysis(ostream & /* os */, const Hypothesis* /* hypo */)
-{
- /*
- os << endl << "TRANSLATION HYPOTHESIS DETAILS:" << endl;
- queue<const Hypothesis*> translationPath;
- while (hypo)
- {
- translationPath.push(hypo);
- hypo = hypo->GetPrevHypo();
- }
-
- while (!translationPath.empty())
- {
- hypo = translationPath.front();
- translationPath.pop();
- const TranslationOption *transOpt = hypo->GetTranslationOption();
- if (transOpt != NULL)
- {
- os << hypo->GetCurrSourceWordsRange() << " ";
- for (size_t decodeStepId = 0; decodeStepId < DecodeStepTranslation::GetNumTransStep(); ++decodeStepId)
- os << decodeStepId << "=" << transOpt->GetSubRangeCount(decodeStepId) << ",";
- os << *transOpt << endl;
- }
- }
-
- os << "END TRANSLATION" << endl;
- */
-}
-
-}
-
diff --git a/moses-chart-cmd/TranslationAnalysis.h b/moses-chart-cmd/TranslationAnalysis.h
deleted file mode 100644
index 7c8f1d545..000000000
--- a/moses-chart-cmd/TranslationAnalysis.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// $Id$
-
-/*
- * also see moses/SentenceStats
- */
-
-#ifndef _TRANSLATION_ANALYSIS_H_
-#define _TRANSLATION_ANALYSIS_H_
-
-#include <iostream>
-#include "moses/ChartHypothesis.h"
-
-namespace TranslationAnalysis
-{
-
-/**
- * print details about the translation represented in hypothesis to
- * os. Included information: phrase alignment, words dropped, scores
- */
-void PrintTranslationAnalysis(std::ostream &os, const Moses::Hypothesis* hypo);
-
-}
-
-#endif
diff --git a/moses-chart-cmd/mbr.cpp b/moses-chart-cmd/mbr.cpp
deleted file mode 100644
index 551378054..000000000
--- a/moses-chart-cmd/mbr.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <iomanip>
-#include <vector>
-#include <map>
-#include <stdlib.h>
-#include <math.h>
-#include <algorithm>
-#include <stdio.h>
-#include "moses/TrellisPathList.h"
-#include "moses/TrellisPath.h"
-#include "moses/StaticData.h"
-#include "moses/Util.h"
-
-#include "mbr.h"
-
-using namespace std ;
-using namespace Moses;
-
-
-/* Input :
- 1. a sorted n-best list, with duplicates filtered out in the following format
- 0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
-
- 2. a weight vector
- 3. bleu order ( default = 4)
- 4. scaling factor to weigh the weight vector (default = 1.0)
-
- Output :
- translations that minimise the Bayes Risk of the n-best list
-
-
-*/
-
-int BLEU_ORDER = 4;
-int SMOOTH = 1;
-int DEBUG = 0;
-float min_interval = 1e-4;
-void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
-{
- vector< const Factor* > ngram;
- for (int k = 0; k < BLEU_ORDER; k++) {
- for(int i =0; i < max((int)sentence.size()-k,0); i++) {
- for ( int j = i; j<= i+k; j++) {
- ngram.push_back(sentence[j]);
- }
- ++allngrams[ngram];
- ngram.clear();
- }
- }
-}
-
-float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
-{
- int comps_n = 2*BLEU_ORDER+1;
- vector<int> comps(comps_n);
- float logbleu = 0.0, brevity;
-
- int hyp_length = sents[hyp].size();
-
- for (int i =0; i<BLEU_ORDER; i++) {
- comps[2*i] = 0;
- comps[2*i+1] = max(hyp_length-i,0);
- }
-
- map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
- map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
-
- for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
- it != hyp_ngrams.end(); it++) {
- map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
- if(ref_it != ref_ngrams.end()) {
- comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
- }
- }
- comps[comps_n-1] = sents[ref].size();
-
- if (DEBUG) {
- for ( int i = 0; i < comps_n; i++)
- cerr << "Comp " << i << " : " << comps[i];
- }
-
- for (int i=0; i<BLEU_ORDER; i++) {
- if (comps[0] == 0)
- return 0.0;
- if ( i > 0 )
- logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
- else
- logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
- }
- logbleu /= BLEU_ORDER;
- brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
- if (brevity < 0.0)
- logbleu += brevity;
- return exp(logbleu);
-}
-
-vector<const Factor*> doMBR(const TrellisPathList& nBestList)
-{
-// cerr << "Sentence " << sent << " has " << sents.size() << " candidate translations" << endl;
- float marginal = 0;
-
- vector<float> joint_prob_vec;
- vector< vector<const Factor*> > translations;
- float joint_prob;
- vector< map < vector <const Factor *>, int > > ngram_stats;
-
- TrellisPathList::const_iterator iter;
- //TrellisPath* hyp = NULL;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const TrellisPath &path = **iter;
- joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore());
- marginal += joint_prob;
- joint_prob_vec.push_back(joint_prob);
- //Cache ngram counts
- map < vector < const Factor *>, int > counts;
- vector<const Factor*> translation;
- GetOutputFactors(path, translation);
-
- //TO DO
- extract_ngrams(translation,counts);
- ngram_stats.push_back(counts);
- translations.push_back(translation);
- }
-
- vector<float> mbr_loss;
- float bleu, weightedLoss;
- float weightedLossCumul = 0;
- float minMBRLoss = 1000000;
- int minMBRLossIdx = -1;
-
- /* Main MBR computation done here */
- for (size_t i = 0; i < nBestList.GetSize(); i++) {
- weightedLossCumul = 0;
- for (size_t j = 0; j < nBestList.GetSize(); j++) {
- if ( i != j) {
- bleu = calculate_score(translations, j, i,ngram_stats );
- weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
- weightedLossCumul += weightedLoss;
- if (weightedLossCumul > minMBRLoss)
- break;
- }
- }
- if (weightedLossCumul < minMBRLoss) {
- minMBRLoss = weightedLossCumul;
- minMBRLossIdx = i;
- }
- }
- /* Find sentence that minimises Bayes Risk under 1- BLEU loss */
- return translations[minMBRLossIdx];
-}
-
-void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
-{
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
- const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
- assert (outputFactorOrder.size() == 1);
-
- // print the surface factor of the translation
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const Phrase &phrase = edge.GetCurrTargetPhrase();
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
-
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
- translation.push_back(factor);
- }
- }
-}
-
diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
deleted file mode 100644
index 120301dbe..000000000
--- a/moses-cmd/IOWrapper.cpp
+++ /dev/null
@@ -1,679 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
- ***********************************************************************/
-
-// example file on how to use moses library
-
-#include <iostream>
-#include <stack>
-#include <boost/algorithm/string.hpp>
-
-#include "moses/TypeDef.h"
-#include "moses/Util.h"
-#include "moses/Hypothesis.h"
-#include "moses/WordsRange.h"
-#include "moses/TrellisPathList.h"
-#include "moses/StaticData.h"
-#include "moses/FeatureVector.h"
-#include "moses/InputFileStream.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-#include "util/exception.hh"
-
-#include "IOWrapper.h"
-
-using namespace std;
-using namespace Moses;
-
-namespace MosesCmd
-{
-
-IOWrapper::IOWrapper(
- const vector<FactorType> &inputFactorOrder
- , const vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const string &nBestFilePath)
- :m_inputFactorOrder(inputFactorOrder)
- ,m_outputFactorOrder(outputFactorOrder)
- ,m_inputFactorUsed(inputFactorUsed)
- ,m_inputFile(NULL)
- ,m_inputStream(&std::cin)
- ,m_nBestStream(NULL)
- ,m_outputWordGraphStream(NULL)
- ,m_outputSearchGraphStream(NULL)
- ,m_detailedTranslationReportingStream(NULL)
- ,m_alignmentOutputStream(NULL)
-{
- Initialization(inputFactorOrder, outputFactorOrder
- , inputFactorUsed
- , nBestSize, nBestFilePath);
-}
-
-IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
- , const std::vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &inputFilePath)
- :m_inputFactorOrder(inputFactorOrder)
- ,m_outputFactorOrder(outputFactorOrder)
- ,m_inputFactorUsed(inputFactorUsed)
- ,m_inputFilePath(inputFilePath)
- ,m_inputFile(new InputFileStream(inputFilePath))
- ,m_nBestStream(NULL)
- ,m_outputWordGraphStream(NULL)
- ,m_outputSearchGraphStream(NULL)
- ,m_detailedTranslationReportingStream(NULL)
- ,m_alignmentOutputStream(NULL)
-{
- Initialization(inputFactorOrder, outputFactorOrder
- , inputFactorUsed
- , nBestSize, nBestFilePath);
-
- m_inputStream = m_inputFile;
-}
-
-IOWrapper::~IOWrapper()
-{
- if (m_inputFile != NULL)
- delete m_inputFile;
- if (m_nBestStream != NULL && !m_surpressSingleBestOutput) {
- // outputting n-best to file, rather than stdout. need to close file and delete obj
- delete m_nBestStream;
- }
- if (m_outputWordGraphStream != NULL) {
- delete m_outputWordGraphStream;
- }
- if (m_outputSearchGraphStream != NULL) {
- delete m_outputSearchGraphStream;
- }
- delete m_detailedTranslationReportingStream;
- delete m_alignmentOutputStream;
-}
-
-void IOWrapper::Initialization(const std::vector<FactorType> &/*inputFactorOrder*/
- , const std::vector<FactorType> &/*outputFactorOrder*/
- , const FactorMask &/*inputFactorUsed*/
- , size_t nBestSize
- , const std::string &nBestFilePath)
-{
- const StaticData &staticData = StaticData::Instance();
-
- // n-best
- m_surpressSingleBestOutput = false;
-
- if (nBestSize > 0) {
- if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") {
- m_nBestStream = &std::cout;
- m_surpressSingleBestOutput = true;
- } else {
- std::ofstream *file = new std::ofstream;
- m_nBestStream = file;
- file->open(nBestFilePath.c_str());
- }
- }
-
- // wordgraph output
- if (staticData.GetOutputWordGraph()) {
- string fileName = staticData.GetParam("output-word-graph")[0];
- std::ofstream *file = new std::ofstream;
- m_outputWordGraphStream = file;
- file->open(fileName.c_str());
- }
-
-
- // search graph output
- if (staticData.GetOutputSearchGraph()) {
- string fileName;
- if (staticData.GetOutputSearchGraphExtended())
- fileName = staticData.GetParam("output-search-graph-extended")[0];
- else
- fileName = staticData.GetParam("output-search-graph")[0];
- std::ofstream *file = new std::ofstream;
- m_outputSearchGraphStream = file;
- file->open(fileName.c_str());
- }
-
- // detailed translation reporting
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
- m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
- UTIL_THROW_IF(!m_detailedTranslationReportingStream->good(),
- util::FileOpenException,
- "File for output of detailed translation report could not be open");
- }
-
- // sentence alignment output
- if (! staticData.GetAlignmentOutputFile().empty()) {
- m_alignmentOutputStream = new ofstream(staticData.GetAlignmentOutputFile().c_str());
- UTIL_THROW_IF(!m_alignmentOutputStream->good(),
- util::FileOpenException,
- "File for output of word alignment could not be open");
- }
-
-}
-
-InputType*
-IOWrapper::
-GetInput(InputType* inputType)
-{
- if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
- if (long x = inputType->GetTranslationId()) {
- if (x>=m_translationId) m_translationId = x+1;
- } else inputType->SetTranslationId(m_translationId++);
-
- return inputType;
- } else {
- delete inputType;
- return NULL;
- }
-}
-
-std::map<size_t, const Factor*> GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor)
-{
- const InputPath &inputPath = hypo.GetTranslationOption().GetInputPath();
- const Phrase &inputPhrase = inputPath.GetPhrase();
-
- std::map<size_t, const Factor*> ret;
-
- for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) {
- const Factor *factor = inputPhrase.GetFactor(sourcePos, placeholderFactor);
- if (factor) {
- std::set<size_t> targetPos = hypo.GetTranslationOption().GetTargetPhrase().GetAlignTerm().GetAlignmentsForSource(sourcePos);
- UTIL_THROW_IF2(targetPos.size() != 1,
- "Placeholder should be aligned to 1, and only 1, word");
- ret[*targetPos.begin()] = factor;
- }
- }
-
- return ret;
-}
-
-/***
- * print surface factor only for the given phrase
- */
-void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
- char reportSegmentation, bool reportAllFactors)
-{
- UTIL_THROW_IF2(outputFactorOrder.size() == 0,
- "Must specific at least 1 output factor");
- const TargetPhrase& phrase = edge.GetCurrTargetPhrase();
- bool markUnknown = StaticData::Instance().GetMarkUnknown();
- if (reportAllFactors == true) {
- out << phrase;
- } else {
- FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
-
- std::map<size_t, const Factor*> placeholders;
- if (placeholderFactor != NOT_FOUND) {
- // creates map of target position -> factor for placeholders
- placeholders = GetPlaceholders(edge, placeholderFactor);
- }
-
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
-
- if (placeholders.size()) {
- // do placeholders
- std::map<size_t, const Factor*>::const_iterator iter = placeholders.find(pos);
- if (iter != placeholders.end()) {
- factor = iter->second;
- }
- }
-
- UTIL_THROW_IF2(factor == NULL,
- "No factor 0 at position " << pos);
-
- //preface surface form with UNK if marking unknowns
- const Word &word = phrase.GetWord(pos);
- if(markUnknown && word.IsOOV()) {
- out << "UNK" << *factor;
- } else {
- out << *factor;
- }
-
- for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
- UTIL_THROW_IF2(factor == NULL,
- "No factor " << i << " at position " << pos);
-
- out << "|" << *factor;
- }
- out << " ";
- }
- }
-
- // trace ("report segmentation") option "-t" / "-tt"
- if (reportSegmentation > 0 && phrase.GetSize() > 0) {
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- const int sourceStart = sourceRange.GetStartPos();
- const int sourceEnd = sourceRange.GetEndPos();
- out << "|" << sourceStart << "-" << sourceEnd; // enriched "-tt"
- if (reportSegmentation == 2) {
- out << ",wa=";
- const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
- OutputAlignment(out, ai, 0, 0);
- out << ",total=";
- out << edge.GetScore() - edge.GetPrevHypo()->GetScore();
- out << ",";
- ScoreComponentCollection scoreBreakdown(edge.GetScoreBreakdown());
- scoreBreakdown.MinusEquals(edge.GetPrevHypo()->GetScoreBreakdown());
- OutputAllFeatureScores(scoreBreakdown, out);
- }
- out << "| ";
- }
-}
-
-void OutputPassthroughInformation(std::string& passthrough, const Hypothesis *hypo)
-{
- passthrough = hypo->GetManager().GetSource().GetPassthroughInformation();
-}
-
-void OutputPassthroughInformation(std::ostream &out, const Hypothesis *hypo)
-{
- std::string passthrough;
- passthrough = hypo->GetManager().GetSource().GetPassthroughInformation();
- out << passthrough;
-}
-
-void OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder,
- char reportSegmentation, bool reportAllFactors)
-{
- if (hypo != NULL) {
- // recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence
- OutputBestSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
- OutputSurface(out, *hypo, outputFactorOrder, reportSegmentation, reportAllFactors);
- }
-}
-
-void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
-{
- typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
- AlignVec alignments = ai.GetSortedAlignments();
-
- AlignVec::const_iterator it;
- for (it = alignments.begin(); it != alignments.end(); ++it) {
- const std::pair<size_t,size_t> &alignment = **it;
- out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
- }
-
-}
-
-void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
-{
- size_t targetOffset = 0;
-
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const TargetPhrase &tp = edge.GetCurrTargetPhrase();
- size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
-
- OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
-
- targetOffset += tp.GetSize();
- }
- out << std::endl;
-}
-
-void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
-{
- std::vector<const Hypothesis *> edges;
- const Hypothesis *currentHypo = hypo;
- while (currentHypo) {
- edges.push_back(currentHypo);
- currentHypo = currentHypo->GetPrevHypo();
- }
-
- OutputAlignment(out, edges);
-
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
-{
- ostringstream out;
- OutputAlignment(out, edges);
-
- collector->Write(lineNo,out.str());
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const Hypothesis *hypo)
-{
- if (collector) {
- std::vector<const Hypothesis *> edges;
- const Hypothesis *currentHypo = hypo;
- while (currentHypo) {
- edges.push_back(currentHypo);
- currentHypo = currentHypo->GetPrevHypo();
- }
-
- OutputAlignment(collector,lineNo, edges);
- }
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const TrellisPath &path)
-{
- if (collector) {
- OutputAlignment(collector,lineNo, path.GetEdges());
- }
-}
-
-void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/, char reportSegmentation, bool reportAllFactors, std::ostream &out)
-{
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
-
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- OutputSurface(out, edge, StaticData::Instance().GetOutputFactorOrder(), reportSegmentation, reportAllFactors);
- }
- out << endl;
-}
-
-void IOWrapper::Backtrack(const Hypothesis *hypo)
-{
-
- if (hypo->GetPrevHypo() != NULL) {
- VERBOSE(3,hypo->GetId() << " <= ");
- Backtrack(hypo->GetPrevHypo());
- }
-}
-
-void OutputBestHypo(const std::vector<Word>& mbrBestHypo, long /*translationId*/, char /*reportSegmentation*/, bool /*reportAllFactors*/, ostream& out)
-{
-
- for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
- const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
- UTIL_THROW_IF2(factor == NULL,
- "No factor 0 at position " << i);
- if (i>0) out << " " << *factor;
- else out << *factor;
- }
- out << endl;
-}
-
-
-void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
-{
- if (hypo->GetPrevHypo()) {
- OutputInput(map, hypo->GetPrevHypo());
- map[hypo->GetCurrSourceWordsRange().GetStartPos()] = &hypo->GetTranslationOption().GetInputPath().GetPhrase();
- }
-}
-
-void OutputInput(std::ostream& os, const Hypothesis* hypo)
-{
- size_t len = hypo->GetInput().GetSize();
- std::vector<const Phrase*> inp_phrases(len, 0);
- OutputInput(inp_phrases, hypo);
- for (size_t i=0; i<len; ++i)
- if (inp_phrases[i]) os << *inp_phrases[i];
-}
-
-void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, char reportSegmentation, bool reportAllFactors)
-{
- if (hypo != NULL) {
- VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
- VERBOSE(3,"Best path: ");
- if (StaticData::Instance().IsPassthroughEnabled()) {
- OutputPassthroughInformation(cout, hypo);
- }
- Backtrack(hypo);
- VERBOSE(3,"0" << std::endl);
- if (!m_surpressSingleBestOutput) {
- if (StaticData::Instance().GetOutputHypoScore()) {
- cout << hypo->GetTotalScore() << " ";
- }
-
- if (StaticData::Instance().IsPathRecoveryEnabled()) {
- OutputInput(cout, hypo);
- cout << "||| ";
- }
- OutputBestSurface(cout, hypo, m_outputFactorOrder, reportSegmentation, reportAllFactors);
- cout << endl;
- }
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- if (!m_surpressSingleBestOutput) {
- cout << endl;
- }
- }
-}
-
-void OutputNBest(std::ostream& out
- , const Moses::TrellisPathList &nBestList
- , const std::vector<Moses::FactorType>& outputFactorOrder
- , long translationId
- , char reportSegmentation)
-{
- const StaticData &staticData = StaticData::Instance();
- bool reportAllFactors = staticData.GetReportAllFactorsNBest();
- bool includeSegmentation = staticData.NBestIncludesSegmentation();
- bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
-
- TrellisPathList::const_iterator iter;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const TrellisPath &path = **iter;
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
-
- // print the surface factor of the translation
- out << translationId << " ||| ";
- if (staticData.IsPassthroughInNBestEnabled()) {
- OutputPassthroughInformation(out, edges[edges.size() - 1]);
- }
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- OutputSurface(out, edge, outputFactorOrder, reportSegmentation, reportAllFactors);
- }
- out << " |||";
-
- // print scores with feature names
- OutputAllFeatureScores(path.GetScoreBreakdown(), out );
-
- // total
- out << " ||| " << path.GetTotalScore();
-
- //phrase-to-phrase segmentation
- if (includeSegmentation) {
- out << " |||";
- for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- WordsRange targetRange = path.GetTargetWordsRange(edge);
- out << " " << sourceRange.GetStartPos();
- if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
- out << "-" << sourceRange.GetEndPos();
- }
- out<< "=" << targetRange.GetStartPos();
- if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
- out<< "-" << targetRange.GetEndPos();
- }
- }
- }
-
- if (includeWordAlignment) {
- out << " ||| ";
- for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- WordsRange targetRange = path.GetTargetWordsRange(edge);
- const int sourceOffset = sourceRange.GetStartPos();
- const int targetOffset = targetRange.GetStartPos();
- const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
-
- OutputAlignment(out, ai, sourceOffset, targetOffset);
-
- }
- }
-
- if (StaticData::Instance().IsPathRecoveryEnabled()) {
- out << " ||| ";
- OutputInput(out, edges[0]);
- }
-
- out << endl;
- }
-
- out << std::flush;
-}
-
-void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out)
-{
- std::string lastName = "";
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for( size_t i=0; i<sff.size(); i++ ) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
- && ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- for( size_t i=0; i<slf.size(); i++ ) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
-}
-
-void OutputFeatureScores( std::ostream& out
- , const ScoreComponentCollection &features
- , const FeatureFunction *ff
- , std::string &lastName )
-{
- const StaticData &staticData = StaticData::Instance();
- bool labeledOutput = staticData.IsLabeledNBestList();
-
- // regular features (not sparse)
- if (ff->GetNumScoreComponents() != 0) {
- if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
- lastName = ff->GetScoreProducerDescription();
- out << " " << lastName << "=";
- }
- vector<float> scores = features.GetScoresForProducer( ff );
- for (size_t j = 0; j<scores.size(); ++j) {
- out << " " << scores[j];
- }
- }
-
- // sparse features
- const FVector scores = features.GetVectorForProducer( ff );
- for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
- out << " " << i->first << "= " << i->second;
- }
-}
-
-void OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId)
-{
- for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
- out << translationId;
- out << " |||";
- const vector<Word> mbrHypo = si->GetWords();
- for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
- const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
- if (i>0) out << " " << *factor;
- else out << *factor;
- }
- out << " |||";
- out << " map: " << si->GetMapScore();
- out << " w: " << mbrHypo.size();
- const vector<float>& ngramScores = si->GetNgramScores();
- for (size_t i = 0; i < ngramScores.size(); ++i) {
- out << " " << ngramScores[i];
- }
- out << " ||| " << si->GetScore();
-
- out << endl;
- }
-}
-
-
-void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solutions,long translationId)
-{
- OutputLatticeMBRNBest(*m_nBestStream, solutions,translationId);
-}
-
-bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
-{
- if (source) delete source;
- switch(inputType) {
- case SentenceInput:
- source = ioWrapper.GetInput(new Sentence);
- break;
- case ConfusionNetworkInput:
- source = ioWrapper.GetInput(new ConfusionNet);
- break;
- case WordLatticeInput:
- source = ioWrapper.GetInput(new WordLattice);
- break;
- default:
- TRACE_ERR("Unknown input type: " << inputType << "\n");
- source = NULL;
- }
- return (source ? true : false);
-}
-
-
-
-IOWrapper *GetIOWrapper(const StaticData &staticData)
-{
- IOWrapper *ioWrapper;
- const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
- ,&outputFactorOrder = staticData.GetOutputFactorOrder();
- FactorMask inputFactorUsed(inputFactorOrder);
-
- // io
- if (staticData.GetParam("input-file").size() == 1) {
- VERBOSE(2,"IO from File" << endl);
- string filePath = staticData.GetParam("input-file")[0];
-
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath()
- , filePath);
- } else {
- VERBOSE(1,"IO from STDOUT/STDIN" << endl);
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath());
- }
- ioWrapper->ResetTranslationId();
-
- IFVERBOSE(1)
- PrintUserTime("Created input-output object");
-
- return ioWrapper;
-}
-
-}
-
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
deleted file mode 100644
index 7afb18948..000000000
--- a/moses-cmd/IOWrapper.h
+++ /dev/null
@@ -1,166 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// example file on how to use moses library
-
-#ifndef moses_cmd_IOWrapper_h
-#define moses_cmd_IOWrapper_h
-
-#include <cassert>
-#include <fstream>
-#include <ostream>
-#include <vector>
-
-#include "moses/TypeDef.h"
-#include "moses/Sentence.h"
-#include "moses/FactorTypeSet.h"
-#include "moses/FactorCollection.h"
-#include "moses/Hypothesis.h"
-#include "moses/OutputCollector.h"
-#include "moses/TrellisPathList.h"
-#include "moses/InputFileStream.h"
-#include "moses/InputType.h"
-#include "moses/WordLattice.h"
-#include "LatticeMBR.h"
-
-namespace Moses
-{
-class ScoreComponentCollection;
-class Hypothesis;
-class Factor;
-}
-
-namespace MosesCmd
-{
-
-/** Helper class that holds misc variables to write data out to command line.
- */
-class IOWrapper
-{
-protected:
- long m_translationId;
-
- const std::vector<Moses::FactorType> &m_inputFactorOrder;
- const std::vector<Moses::FactorType> &m_outputFactorOrder;
- const Moses::FactorMask &m_inputFactorUsed;
- std::string m_inputFilePath;
- Moses::InputFileStream *m_inputFile;
- std::istream *m_inputStream;
- std::ostream *m_nBestStream
- ,*m_outputWordGraphStream,*m_outputSearchGraphStream;
- std::ostream *m_detailedTranslationReportingStream;
- std::ofstream *m_alignmentOutputStream;
- bool m_surpressSingleBestOutput;
-
- void Initialization(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath);
-
-
-public:
- IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath);
-
- IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &infilePath);
- ~IOWrapper();
-
- Moses::InputType* GetInput(Moses::InputType *inputType);
-
- void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, char reportSegmentation, bool reportAllFactors);
- void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
- void Backtrack(const Moses::Hypothesis *hypo);
-
- void ResetTranslationId() {
- m_translationId = 0;
- }
-
- std::ofstream *GetAlignmentOutputStream() {
- return m_alignmentOutputStream;
- }
-
- std::ostream &GetOutputWordGraphStream() {
- return *m_outputWordGraphStream;
- }
- std::ostream &GetOutputSearchGraphStream() {
- return *m_outputSearchGraphStream;
- }
-
- std::ostream &GetDetailedTranslationReportingStream() {
- assert (m_detailedTranslationReportingStream);
- return *m_detailedTranslationReportingStream;
- }
-};
-
-IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
-bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source);
-void OutputLanguageModelOrder(std::ostream &out, const Moses::Hypothesis *hypo, Moses::Manager &manager);
-void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, char reportSegmentation, bool reportAllFactors);
-void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
-void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
- char reportSegmentation, bool reportAllFactors, std::ostream& out);
-void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,char reportSegmentation, bool reportAllFactors, std::ostream &out);
-void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
-void OutputPassthroughInformation(std::string& passthrough, const Moses::Hypothesis* hypo);
-void OutputPassthroughInformation(std::ostream& os, const Moses::Hypothesis* hypo);
-void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
-void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
-void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
-void OutputAlignment(std::ostream &out, const Moses::AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset);
-
-void OutputNBest(std::ostream& out
- , const Moses::TrellisPathList &nBestList
- , const std::vector<Moses::FactorType>& outputFactorOrder
- , long translationId
- , char reportSegmentation);
-void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out);
-void OutputFeatureScores( std::ostream& out
- , const Moses::ScoreComponentCollection &features
- , const Moses::FeatureFunction *ff
- , std::string &lastName );
-
-// creates a map of TARGET positions which should be replaced by word using placeholder
-std::map<size_t, const Moses::Factor*> GetPlaceholders(const Moses::Hypothesis &hypo, Moses::FactorType placeholderFactor);
-
-}
-
-#endif
diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile
index bddc10911..ee762823e 100644
--- a/moses-cmd/Jamfile
+++ b/moses-cmd/Jamfile
@@ -1,6 +1,6 @@
-alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ;
+alias deps : ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ;
exe moses : Main.cpp deps ;
exe lmbrgrid : LatticeMBRGrid.cpp deps ;
-
alias programs : moses lmbrgrid ;
+
diff --git a/moses-cmd/LatticeMBRGrid.cpp b/moses-cmd/LatticeMBRGrid.cpp
index 39d88f34d..9b2ee167c 100644
--- a/moses-cmd/LatticeMBRGrid.cpp
+++ b/moses-cmd/LatticeMBRGrid.cpp
@@ -46,8 +46,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include <stdexcept>
#include <set>
-#include "IOWrapper.h"
-#include "LatticeMBR.h"
+#include "moses/IOWrapper.h"
+#include "moses/LatticeMBR.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
#include "util/exception.hh"
@@ -55,12 +55,11 @@ POSSIBILITY OF SUCH DAMAGE.
using namespace std;
using namespace Moses;
-using namespace MosesCmd;
//keys
enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale};
-namespace MosesCmd
+namespace Moses
{
class Grid
@@ -159,8 +158,8 @@ int main(int argc, char* argv[])
StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
staticData.SetUseLatticeMBR(true);
- IOWrapper* ioWrapper = GetIOWrapper(staticData);
+ IOWrapper* ioWrapper = new IOWrapper();
if (!ioWrapper) {
throw runtime_error("Failed to initialise IOWrapper");
}
@@ -178,11 +177,12 @@ int main(int argc, char* argv[])
const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
++lineCount;
- Sentence sentence;
- Manager manager(lineCount, *source, staticData.GetSearchAlgorithm());
- manager.ProcessSentence();
+ source->SetTranslationId(lineCount);
+
+ Manager manager(*source, staticData.GetSearchAlgorithm());
+ manager.Decode();
TrellisPathList nBestList;
manager.CalcNBest(nBestSize, nBestList,true);
//grid search
@@ -200,7 +200,7 @@ int main(int argc, char* argv[])
staticData.SetMBRScale(scale);
cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
- OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
+ ioWrapper->OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
staticData.GetReportAllFactors(),cout);
}
}
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index c931ea3dc..03b3a5054 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -22,14 +22,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/**
* Moses main, for single-threaded and multi-threaded.
**/
-
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/iostreams/device/file.hpp>
-#include <boost/iostreams/filter/bzip2.hpp>
-#include <boost/iostreams/filter/gzip.hpp>
-#include <boost/iostreams/filtering_stream.hpp>
-
#include <exception>
#include <fstream>
#include <sstream>
@@ -42,537 +34,39 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//#include <vld.h>
#endif
-#include "TranslationAnalysis.h"
-#include "IOWrapper.h"
-#include "mbr.h"
-
+#include "moses/IOWrapper.h"
#include "moses/Hypothesis.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
+#include "moses/TypeDef.h"
#include "moses/Util.h"
#include "moses/Timer.h"
-#include "moses/ThreadPool.h"
-#include "moses/OutputCollector.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/TranslationTask.h"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
#endif
-using namespace std;
-using namespace Moses;
-using namespace MosesCmd;
-
-namespace MosesCmd
-{
-// output floats with five significant digits
-static const size_t PRECISION = 3;
-
-/** Enforce rounding */
-void fix(std::ostream& stream, size_t size)
-{
- stream.setf(std::ios::fixed);
- stream.precision(size);
-}
-
-/** Translates a sentence.
- * - calls the search (Manager)
- * - applies the decision rule
- * - outputs best translation and additional reporting
- **/
-class TranslationTask : public Task
-{
-
-public:
-
- TranslationTask(size_t lineNumber,
- InputType* source, OutputCollector* outputCollector, OutputCollector* nbestCollector,
- OutputCollector* latticeSamplesCollector,
- OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
- OutputCollector* detailedTranslationCollector,
- OutputCollector* alignmentInfoCollector,
- OutputCollector* unknownsCollector,
- bool outputSearchGraphSLF,
- bool outputSearchGraphHypergraph) :
- m_source(source), m_lineNumber(lineNumber),
- m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
- m_latticeSamplesCollector(latticeSamplesCollector),
- m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
- m_detailedTranslationCollector(detailedTranslationCollector),
- m_alignmentInfoCollector(alignmentInfoCollector),
- m_unknownsCollector(unknownsCollector),
- m_outputSearchGraphSLF(outputSearchGraphSLF),
- m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
-
- /** Translate one sentence
- * gets called by main function implemented at end of this source file */
- void Run() {
- // shorthand for "global data"
- const StaticData &staticData = StaticData::Instance();
-
- // input sentence
- Sentence sentence;
-
- // report wall time spent on translation
- Timer translationTime;
- translationTime.start();
-
- // report thread number
-#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
- TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << pthread_self() << std::endl);
-#endif
-
-
- // execute the translation
- // note: this executes the search, resulting in a search graph
- // we still need to apply the decision rule (MAP, MBR, ...)
- Timer initTime;
- initTime.start();
- Manager manager(m_lineNumber, *m_source,staticData.GetSearchAlgorithm());
- VERBOSE(1, "Line " << m_lineNumber << ": Initialize search took " << initTime << " seconds total" << endl);
- manager.ProcessSentence();
-
- // we are done with search, let's look what we got
- Timer additionalReportingTime;
- additionalReportingTime.start();
-
- // output word graph
- if (m_wordGraphCollector) {
- ostringstream out;
- fix(out,PRECISION);
- manager.GetWordGraph(m_lineNumber, out);
- m_wordGraphCollector->Write(m_lineNumber, out.str());
- }
-
- // output search graph
- if (m_searchGraphCollector) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraph(m_lineNumber, out);
- m_searchGraphCollector->Write(m_lineNumber, out.str());
-
-#ifdef HAVE_PROTOBUF
- if (staticData.GetOutputSearchGraphPB()) {
- ostringstream sfn;
- sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << m_lineNumber << ".pb" << ends;
- string fn = sfn.str();
- VERBOSE(2, "Writing search graph to " << fn << endl);
- fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out);
- manager.SerializeSearchGraphPB(m_lineNumber, output);
- }
+#ifdef PT_UG
+#include <boost/foreach.hpp>
+#include "moses/TranslationModel/UG/mmsapt.h"
+#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
#endif
- }
-
- // Output search graph in HTK standard lattice format (SLF)
- if (m_outputSearchGraphSLF) {
- stringstream fileName;
- fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
- std::ofstream *file = new std::ofstream;
- file->open(fileName.str().c_str());
- if (file->is_open() && file->good()) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraphAsSLF(m_lineNumber, out);
- *file << out.str();
- file -> flush();
- } else {
- TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
- }
- delete file;
- }
-
- // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
- if (m_outputSearchGraphHypergraph) {
-
- vector<string> hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph");
-
- bool appendSuffix;
- if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") {
- appendSuffix = true;
- } else {
- appendSuffix = false;
- }
-
- string compression;
- if (hypergraphParameters.size() > 1) {
- compression = hypergraphParameters[1];
- } else {
- compression = "txt";
- }
-
- string hypergraphDir;
- if ( hypergraphParameters.size() > 2 ) {
- hypergraphDir = hypergraphParameters[2];
- } else {
- string nbestFile = staticData.GetNBestFilePath();
- if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
- boost::filesystem::path nbestPath(nbestFile);
-
- // In the Boost filesystem API version 2,
- // which was the default prior to Boost 1.46,
- // the filename() method returned a string.
- //
- // In the Boost filesystem API version 3,
- // which is the default starting with Boost 1.46,
- // the filename() method returns a path object.
- //
- // To get a string from the path object,
- // the native() method must be called.
- // hypergraphDir = nbestPath.parent_path().filename()
- //#if BOOST_VERSION >= 104600
- // .native()
- //#endif
- //;
-
- // Hopefully the following compiles under all versions of Boost.
- //
- // If this line gives you compile errors,
- // contact Lane Schwartz on the Moses mailing list
- hypergraphDir = nbestPath.parent_path().string();
-
- } else {
- stringstream hypergraphDirName;
- hypergraphDirName << boost::filesystem::current_path().string() << "/hypergraph";
- hypergraphDir = hypergraphDirName.str();
- }
- }
-
- if ( ! boost::filesystem::exists(hypergraphDir) ) {
- boost::filesystem::create_directory(hypergraphDir);
- }
-
- if ( ! boost::filesystem::exists(hypergraphDir) ) {
- TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl);
- } else if ( ! boost::filesystem::is_directory(hypergraphDir) ) {
- TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
- } else {
- stringstream fileName;
- fileName << hypergraphDir << "/" << m_lineNumber;
- if ( appendSuffix ) {
- fileName << "." << compression;
- }
- boost::iostreams::filtering_ostream *file
- = new boost::iostreams::filtering_ostream;
-
- if ( compression == "gz" ) {
- file->push( boost::iostreams::gzip_compressor() );
- } else if ( compression == "bz2" ) {
- file->push( boost::iostreams::bzip2_compressor() );
- } else if ( compression != "txt" ) {
- TRACE_ERR("Unrecognized hypergraph compression format ("
- << compression
- << ") - using uncompressed plain txt" << std::endl);
- compression = "txt";
- }
-
- file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
-
- if (file->is_complete() && file->good()) {
- fix(*file,PRECISION);
- manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
- file -> flush();
- } else {
- TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber
- << " because the output file " << fileName.str()
- << " is not open or not ready for writing"
- << std::endl);
- }
- file -> pop();
- delete file;
- }
- }
- additionalReportingTime.stop();
-
- // apply decision rule and output best translation(s)
- if (m_outputCollector) {
- ostringstream out;
- ostringstream debug;
- fix(debug,PRECISION);
-
- // all derivations - send them to debug stream
- if (staticData.PrintAllDerivations()) {
- additionalReportingTime.start();
- manager.PrintAllDerivations(m_lineNumber, debug);
- additionalReportingTime.stop();
- }
-
- Timer decisionRuleTime;
- decisionRuleTime.start();
-
- // MAP decoding: best hypothesis
- const Hypothesis* bestHypo = NULL;
- if (!staticData.UseMBR()) {
- bestHypo = manager.GetBestHypothesis();
- if (bestHypo) {
- if (staticData.GetOutputHypoScore()) {
- out << bestHypo->GetTotalScore() << ' ';
- }
- if (staticData.IsPathRecoveryEnabled()) {
- OutputInput(out, bestHypo);
- out << "||| ";
- }
- if (staticData.IsIDEnabled()) {
- out << m_source->GetTranslationId() << " ";
- }
- if (staticData.IsPassthroughEnabled()) {
- OutputPassthroughInformation(out, bestHypo);
- }
-
- if (staticData.GetReportSegmentation() == 2) {
- manager.GetOutputLanguageModelOrder(out, bestHypo);
- }
- OutputBestSurface(
- out,
- bestHypo,
- staticData.GetOutputFactorOrder(),
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors());
- if (staticData.PrintAlignmentInfo()) {
- out << "||| ";
- OutputAlignment(out, bestHypo);
- }
-
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
- IFVERBOSE(1) {
- debug << "BEST TRANSLATION: " << *bestHypo << endl;
- }
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- }
-
- out << endl;
- }
-
- // MBR decoding (n-best MBR, lattice MBR, consensus)
- else {
- // we first need the n-best translations
- size_t nBestSize = staticData.GetMBRSize();
- if (nBestSize <= 0) {
- cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl;
- exit(1);
- }
- TrellisPathList nBestList;
- manager.CalcNBest(nBestSize, nBestList,true);
- VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
- IFVERBOSE(2) {
- PrintUserTime("calculated n-best list for (L)MBR decoding");
- }
-
- // lattice MBR
- if (staticData.UseLatticeMBR()) {
- if (m_nbestCollector) {
- //lattice mbr nbest
- vector<LatticeMBRSolution> solutions;
- size_t n = min(nBestSize, staticData.GetNBestSize());
- getLatticeMBRNBest(manager,nBestList,solutions,n);
- ostringstream out;
- OutputLatticeMBRNBest(out, solutions,m_lineNumber);
- m_nbestCollector->Write(m_lineNumber, out.str());
- } else {
- //Lattice MBR decoding
- vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
- OutputBestHypo(mbrBestHypo, m_lineNumber, staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- IFVERBOSE(2) {
- PrintUserTime("finished Lattice MBR decoding");
- }
- }
- }
-
- // consensus decoding
- else if (staticData.UseConsensusDecoding()) {
- const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
- OutputBestHypo(conBestHypo, m_lineNumber,
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, conBestHypo);
- IFVERBOSE(2) {
- PrintUserTime("finished Consensus decoding");
- }
- }
-
- // n-best MBR decoding
- else {
- const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
- OutputBestHypo(mbrBestHypo, m_lineNumber,
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, mbrBestHypo);
- IFVERBOSE(2) {
- PrintUserTime("finished MBR decoding");
- }
- }
- }
-
- // report best translation to output collector
- m_outputCollector->Write(m_lineNumber,out.str(),debug.str());
- decisionRuleTime.stop();
- VERBOSE(1, "Line " << m_lineNumber << ": Decision rule took " << decisionRuleTime << " seconds total" << endl);
- }
-
- additionalReportingTime.start();
-
- // output n-best list
- if (m_nbestCollector && !staticData.UseLatticeMBR()) {
- TrellisPathList nBestList;
- ostringstream out;
- manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
- OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
- m_nbestCollector->Write(m_lineNumber, out.str());
- }
-
- //lattice samples
- if (m_latticeSamplesCollector) {
- TrellisPathList latticeSamples;
- ostringstream out;
- manager.CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples);
- OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
- m_latticeSamplesCollector->Write(m_lineNumber, out.str());
- }
-
- // detailed translation reporting
- if (m_detailedTranslationCollector) {
- ostringstream out;
- fix(out,PRECISION);
- TranslationAnalysis::PrintTranslationAnalysis(out, manager.GetBestHypothesis());
- m_detailedTranslationCollector->Write(m_lineNumber,out.str());
- }
-
- //list of unknown words
- if (m_unknownsCollector) {
- const vector<const Phrase*>& unknowns = manager.getSntTranslationOptions()->GetUnknownSources();
- ostringstream out;
- for (size_t i = 0; i < unknowns.size(); ++i) {
- out << *(unknowns[i]);
- }
- out << endl;
- m_unknownsCollector->Write(m_lineNumber, out.str());
- }
-
- // report additional statistics
- manager.CalcDecoderStatistics();
- VERBOSE(1, "Line " << m_lineNumber << ": Additional reporting took " << additionalReportingTime << " seconds total" << endl);
- VERBOSE(1, "Line " << m_lineNumber << ": Translation took " << translationTime << " seconds total" << endl);
- IFVERBOSE(2) {
- PrintUserTime("Sentence Decoding Time:");
- }
- }
-
- ~TranslationTask() {
- delete m_source;
- }
-
-private:
- InputType* m_source;
- size_t m_lineNumber;
- OutputCollector* m_outputCollector;
- OutputCollector* m_nbestCollector;
- OutputCollector* m_latticeSamplesCollector;
- OutputCollector* m_wordGraphCollector;
- OutputCollector* m_searchGraphCollector;
- OutputCollector* m_detailedTranslationCollector;
- OutputCollector* m_alignmentInfoCollector;
- OutputCollector* m_unknownsCollector;
- bool m_outputSearchGraphSLF;
- bool m_outputSearchGraphHypergraph;
- std::ofstream *m_alignmentStream;
-
-
-};
-
-static void PrintFeatureWeight(const FeatureFunction* ff)
-{
- cout << ff->GetScoreProducerDescription() << "=";
- size_t numScoreComps = ff->GetNumScoreComponents();
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- for (size_t i = 0; i < numScoreComps; ++i) {
- cout << " " << values[i];
- }
- cout << endl;
-}
-
-static void ShowWeights()
-{
- //TODO: Find a way of ensuring this order is synced with the nbest
- fix(cout,6);
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
-
- for (size_t i = 0; i < sff.size(); ++i) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- } else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- } else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
-}
+using namespace std;
+using namespace Moses;
-size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
+namespace Moses
{
- size_t numScoreComps = ff->GetNumScoreComponents();
- if (numScoreComps != 0) {
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- if (numScoreComps > 1) {
- for (size_t i = 0; i < numScoreComps; ++i) {
- outputSearchGraphStream << ff->GetScoreProducerDescription()
- << i
- << "=" << values[i] << endl;
- }
- } else {
- outputSearchGraphStream << ff->GetScoreProducerDescription()
- << "=" << values[0] << endl;
- }
- return index+numScoreComps;
- } else {
- UTIL_THROW2("Sparse features are not yet supported when outputting hypergraph format");
- }
-}
void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
-
- const vector<const StatelessFeatureFunction*>& slf =StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- size_t featureIndex = 1;
- for (size_t i = 0; i < sff.size(); ++i) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- /*
- if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
- slf[i]->GetScoreProducerWeightShortName() != "tm" &&
- slf[i]->GetScoreProducerWeightShortName() != "I" &&
- slf[i]->GetScoreProducerWeightShortName() != "g")
- */
- {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
- }
- }
- const vector<PhraseDictionary*>& pds = PhraseDictionary::GetColl();
- for( size_t i=0; i<pds.size(); i++ ) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
- }
- const vector<GenerationDictionary*>& gds = GenerationDictionary::GetColl();
- for( size_t i=0; i<gds.size(); i++ ) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
- }
-
+ StaticData::Instance().GetAllWeights().Save(outputSearchGraphStream);
}
@@ -586,7 +80,7 @@ int main(int argc, char** argv)
#ifdef HAVE_PROTOBUF
GOOGLE_PROTOBUF_VERIFY_VERSION;
#endif
-
+
// echo command line, if verbose
IFVERBOSE(1) {
TRACE_ERR("command: ");
@@ -595,8 +89,8 @@ int main(int argc, char** argv)
}
// set number of significant decimals in output
- fix(cout,PRECISION);
- fix(cerr,PRECISION);
+ FixPrecision(cout);
+ FixPrecision(cerr);
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
@@ -605,15 +99,13 @@ int main(int argc, char** argv)
exit(1);
}
- std::cerr <<"Before StaticData::LoadDataStatic" << std::endl;
+
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(&params, argv[0])) {
exit(1);
}
- std::cerr <<"After StaticData::LoadDataStatic" << std::endl;
- std::cerr <<"Before ShowWeights" << std::endl;
// setting "-show-weights" -> just dump out weights and exit
if (params.isParamSpecified("show-weights")) {
ShowWeights();
@@ -628,8 +120,12 @@ int main(int argc, char** argv)
srand(time(NULL));
// set up read/writing class
- IOWrapper* ioWrapper = GetIOWrapper(staticData);
- if (!ioWrapper) {
+ IFVERBOSE(1) {
+ PrintUserTime("Created input-output object");
+ }
+
+ IOWrapper* ioWrapper = new IOWrapper();
+ if (ioWrapper == NULL) {
cerr << "Error; Failed to create IO object" << endl;
exit(1);
}
@@ -641,114 +137,6 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
- if (staticData.GetOutputSearchGraphHypergraph()) {
- ofstream* weightsOut = new std::ofstream;
- stringstream weightsFilename;
- if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
- weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3];
- } else {
- string nbestFile = staticData.GetNBestFilePath();
- if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
- boost::filesystem::path nbestPath(nbestFile);
- weightsFilename << nbestPath.parent_path().filename() << "/weights";
- } else {
- weightsFilename << boost::filesystem::current_path().string() << "/hypergraph/weights";
- }
- }
- boost::filesystem::path weightsFilePath(weightsFilename.str());
- if ( ! boost::filesystem::exists(weightsFilePath.parent_path()) ) {
- boost::filesystem::create_directory(weightsFilePath.parent_path());
- }
- TRACE_ERR("The weights file is " << weightsFilename.str() << "\n");
- weightsOut->open(weightsFilename.str().c_str());
- OutputFeatureWeightsForHypergraph(*weightsOut);
- weightsOut->flush();
- weightsOut->close();
- delete weightsOut;
- }
-
-
- // initialize output streams
- // note: we can't just write to STDOUT or files
- // because multithreading may return sentences in shuffled order
- auto_ptr<OutputCollector> outputCollector; // for translations
- auto_ptr<OutputCollector> nbestCollector; // for n-best lists
- auto_ptr<OutputCollector> latticeSamplesCollector; //for lattice samples
- auto_ptr<ofstream> nbestOut;
- auto_ptr<ofstream> latticeSamplesOut;
- size_t nbestSize = staticData.GetNBestSize();
- string nbestFile = staticData.GetNBestFilePath();
- bool output1best = true;
- if (nbestSize) {
- if (nbestFile == "-" || nbestFile == "/dev/stdout") {
- // nbest to stdout, no 1-best
- nbestCollector.reset(new OutputCollector());
- output1best = false;
- } else {
- // nbest to file, 1-best to stdout
- nbestOut.reset(new ofstream(nbestFile.c_str()));
- if (!nbestOut->good()) {
- TRACE_ERR("ERROR: Failed to open " << nbestFile << " for nbest lists" << endl);
- exit(1);
- }
- nbestCollector.reset(new OutputCollector(nbestOut.get()));
- }
- }
- size_t latticeSamplesSize = staticData.GetLatticeSamplesSize();
- string latticeSamplesFile = staticData.GetLatticeSamplesFilePath();
- if (latticeSamplesSize) {
- if (latticeSamplesFile == "-" || latticeSamplesFile == "/dev/stdout") {
- latticeSamplesCollector.reset(new OutputCollector());
- output1best = false;
- } else {
- latticeSamplesOut.reset(new ofstream(latticeSamplesFile.c_str()));
- if (!latticeSamplesOut->good()) {
- TRACE_ERR("ERROR: Failed to open " << latticeSamplesFile << " for lattice samples" << endl);
- exit(1);
- }
- latticeSamplesCollector.reset(new OutputCollector(latticeSamplesOut.get()));
- }
- }
- if (output1best) {
- outputCollector.reset(new OutputCollector());
- }
-
- // initialize stream for word graph (aka: output lattice)
- auto_ptr<OutputCollector> wordGraphCollector;
- if (staticData.GetOutputWordGraph()) {
- wordGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputWordGraphStream())));
- }
-
- // initialize stream for search graph
- // note: this is essentially the same as above, but in a different format
- auto_ptr<OutputCollector> searchGraphCollector;
- if (staticData.GetOutputSearchGraph()) {
- searchGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputSearchGraphStream())));
- }
-
- // initialize stram for details about the decoder run
- auto_ptr<OutputCollector> detailedTranslationCollector;
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- detailedTranslationCollector.reset(new OutputCollector(&(ioWrapper->GetDetailedTranslationReportingStream())));
- }
-
- // initialize stram for word alignment between input and output
- auto_ptr<OutputCollector> alignmentInfoCollector;
- if (!staticData.GetAlignmentOutputFile().empty()) {
- alignmentInfoCollector.reset(new OutputCollector(ioWrapper->GetAlignmentOutputStream()));
- }
-
- //initialise stream for unknown (oov) words
- auto_ptr<OutputCollector> unknownsCollector;
- auto_ptr<ofstream> unknownsStream;
- if (!staticData.GetOutputUnknownsFile().empty()) {
- unknownsStream.reset(new ofstream(staticData.GetOutputUnknownsFile().c_str()));
- if (!unknownsStream->good()) {
- TRACE_ERR("Unable to open " << staticData.GetOutputUnknownsFile() << " for unknowns");
- exit(1);
- }
- unknownsCollector.reset(new OutputCollector(unknownsStream.get()));
- }
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
@@ -757,24 +145,51 @@ int main(int argc, char** argv)
// main loop over set of input sentences
InputType* source = NULL;
size_t lineCount = staticData.GetStartTranslationId();
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
+ source->SetTranslationId(lineCount);
IFVERBOSE(1) {
ResetUserTime();
}
+
+ FeatureFunction::CallChangeSource(source);
+
// set up task of translating one sentence
- TranslationTask* task =
- new TranslationTask(lineCount,source, outputCollector.get(),
- nbestCollector.get(),
- latticeSamplesCollector.get(),
- wordGraphCollector.get(),
- searchGraphCollector.get(),
- detailedTranslationCollector.get(),
- alignmentInfoCollector.get(),
- unknownsCollector.get(),
- staticData.GetOutputSearchGraphSLF(),
- staticData.GetOutputSearchGraphHypergraph());
+ TranslationTask* task;
+ if (staticData.IsChart()) {
+ // scfg
+ task = new TranslationTask(source, *ioWrapper, 2);
+ }
+ else {
+ // pb
+ task = new TranslationTask(source, *ioWrapper, 1);
+ }
+
// execute task
#ifdef WITH_THREADS
+#ifdef PT_UG
+ bool spe = params.isParamSpecified("spe-src");
+ if (spe) {
+ // simulated post-editing: always run single-threaded!
+ task->Run();
+ delete task;
+ string src,trg,aln;
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_src,src), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_trg,trg), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_aln,aln), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl())
+ {
+ Mmsapt* sapt = dynamic_cast<Mmsapt*>(pd);
+ if (sapt) sapt->add(src,trg,aln);
+ VERBOSE(1,"[" << HERE << " added src] " << src << endl);
+ VERBOSE(1,"[" << HERE << " added trg] " << trg << endl);
+ VERBOSE(1,"[" << HERE << " added aln] " << aln << endl);
+ }
+ }
+ else
+#endif
pool.Submit(task);
#else
task->Run();
diff --git a/moses-cmd/Main.h b/moses-cmd/Main.h
index 362c1f245..49fee0219 100644
--- a/moses-cmd/Main.h
+++ b/moses-cmd/Main.h
@@ -1,3 +1,4 @@
+#pragma once
// $Id$
/***********************************************************************
@@ -32,12 +33,10 @@ POSSIBILITY OF SUCH DAMAGE.
// example file on how to use moses library
-#ifndef moses_cmd_Main_h
-#define moses_cmd_Main_h
#include "moses/StaticData.h"
class IOWrapper;
int main(int argc, char* argv[]);
-#endif
+
diff --git a/moses/AlignmentInfo.cpp b/moses/AlignmentInfo.cpp
index ed317a764..b059a9ffd 100644
--- a/moses/AlignmentInfo.cpp
+++ b/moses/AlignmentInfo.cpp
@@ -25,13 +25,22 @@
namespace Moses
{
+
AlignmentInfo::AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs)
: m_collection(pairs)
{
- BuildNonTermIndexMap();
+ BuildNonTermIndexMaps();
+}
+
+AlignmentInfo::AlignmentInfo(const std::vector<unsigned char> &aln)
+{
+ assert(aln.size()%2==0);
+ for (size_t i = 0; i < aln.size(); i+= 2)
+ m_collection.insert(std::make_pair(size_t(aln[i]),size_t(aln[i+1])));
+ BuildNonTermIndexMaps();
}
-void AlignmentInfo::BuildNonTermIndexMap()
+void AlignmentInfo::BuildNonTermIndexMaps()
{
if (m_collection.empty()) {
return;
@@ -44,14 +53,17 @@ void AlignmentInfo::BuildNonTermIndexMap()
}
}
m_nonTermIndexMap.resize(maxIndex+1, NOT_FOUND);
+ m_nonTermIndexMap2.resize(maxIndex+1, NOT_FOUND);
size_t i = 0;
for (p = begin(); p != end(); ++p) {
if (m_nonTermIndexMap[p->second] != NOT_FOUND) {
// 1-to-many. Definitely a set of terminals. Don't bother storing 1-to-1 index map
m_nonTermIndexMap.clear();
+ m_nonTermIndexMap2.clear();
return;
}
m_nonTermIndexMap[p->second] = i++;
+ m_nonTermIndexMap2[p->second] = p->first;
}
}
diff --git a/moses/AlignmentInfo.h b/moses/AlignmentInfo.h
index 4e9647e3a..895dde8a1 100644
--- a/moses/AlignmentInfo.h
+++ b/moses/AlignmentInfo.h
@@ -65,6 +65,12 @@ public:
return m_nonTermIndexMap;
}
+ /** Like GetNonTermIndexMap but the return value is the symbol index (i.e.
+ * the index counting both terminals and non-terminals) */
+ const NonTermIndexMap &GetNonTermIndexMap2() const {
+ return m_nonTermIndexMap2;
+ }
+
const CollType &GetAlignments() const {
return m_collection;
}
@@ -88,11 +94,12 @@ public:
private:
//! AlignmentInfo objects should only be created by an AlignmentInfoCollection
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs);
-
- void BuildNonTermIndexMap();
+ explicit AlignmentInfo(const std::vector<unsigned char> &aln);
+ void BuildNonTermIndexMaps();
CollType m_collection;
NonTermIndexMap m_nonTermIndexMap;
+ NonTermIndexMap m_nonTermIndexMap2;
};
/** Define an arbitrary strict weak ordering between AlignmentInfo objects
diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp
index ef6e62eb3..0a54226cd 100644
--- a/moses/AlignmentInfoCollection.cpp
+++ b/moses/AlignmentInfoCollection.cpp
@@ -38,23 +38,23 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
return *m_emptyAlignmentInfo;
}
-const AlignmentInfo *AlignmentInfoCollection::Add(
- const std::set<std::pair<size_t,size_t> > &pairs)
+AlignmentInfo const *
+AlignmentInfoCollection::
+Add(AlignmentInfo const& ainfo)
{
- AlignmentInfo pairsAlignmentInfo(pairs);
#ifdef WITH_THREADS
{
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
- AlignmentInfoSet::const_iterator i = m_collection.find(pairsAlignmentInfo);
+ AlignmentInfoSet::const_iterator i = m_collection.find(ainfo);
if (i != m_collection.end())
return &*i;
}
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
- std::pair<AlignmentInfoSet::iterator, bool> ret =
- m_collection.insert(pairsAlignmentInfo);
+ std::pair<AlignmentInfoSet::iterator, bool> ret = m_collection.insert(ainfo);
return &(*ret.first);
}
+
}
diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h
index 37d717b0f..1db0a2268 100644
--- a/moses/AlignmentInfoCollection.h
+++ b/moses/AlignmentInfoCollection.h
@@ -46,7 +46,16 @@ public:
* contains such an object then returns a pointer to it; otherwise a new
* one is inserted.
*/
- const AlignmentInfo *Add(const std::set<std::pair<size_t,size_t> > &);
+ private:
+ const AlignmentInfo* Add(AlignmentInfo const& ainfo);
+
+ public:
+ template<typename ALNREP>
+ AlignmentInfo const *
+ Add(ALNREP const & aln)
+ {
+ return this->Add(AlignmentInfo(aln));
+ }
//! Returns a pointer to an empty AlignmentInfo object.
const AlignmentInfo &GetEmptyAlignmentInfo() const;
@@ -54,6 +63,7 @@ public:
private:
typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
+
//! Only a single static variable should be created.
AlignmentInfoCollection();
~AlignmentInfoCollection();
diff --git a/moses/BaseManager.cpp b/moses/BaseManager.cpp
new file mode 100644
index 000000000..e41685344
--- /dev/null
+++ b/moses/BaseManager.cpp
@@ -0,0 +1,111 @@
+#include <vector>
+
+#include "StaticData.h"
+#include "BaseManager.h"
+#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/FF/StatefulFeatureFunction.h"
+
+using namespace std;
+
+namespace Moses
+{
+void BaseManager::OutputAllFeatureScores(const Moses::ScoreComponentCollection &features,
+ std::ostream &out) const
+{
+ std::string lastName = "";
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for( size_t i=0; i<sff.size(); i++ ) {
+ const StatefulFeatureFunction *ff = sff[i];
+ if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
+ && ff->IsTuneable()) {
+ OutputFeatureScores( out, features, ff, lastName );
+ }
+ }
+ const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ for( size_t i=0; i<slf.size(); i++ ) {
+ const StatelessFeatureFunction *ff = slf[i];
+ if (ff->IsTuneable()) {
+ OutputFeatureScores( out, features, ff, lastName );
+ }
+ }
+}
+
+void BaseManager::OutputFeatureScores( std::ostream& out,
+ const ScoreComponentCollection &features,
+ const FeatureFunction *ff,
+ std::string &lastName ) const
+{
+ const StaticData &staticData = StaticData::Instance();
+ bool labeledOutput = staticData.IsLabeledNBestList();
+
+ // regular features (not sparse)
+ if (ff->GetNumScoreComponents() != 0) {
+ if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
+ lastName = ff->GetScoreProducerDescription();
+ out << " " << lastName << "=";
+ }
+ vector<float> scores = features.GetScoresForProducer( ff );
+ for (size_t j = 0; j<scores.size(); ++j) {
+ out << " " << scores[j];
+ }
+ }
+
+ // sparse features
+ const FVector scores = features.GetVectorForProducer( ff );
+ for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
+ out << " " << i->first << "= " << i->second;
+ }
+}
+
+/***
+ * print surface factor only for the given phrase
+ */
+void BaseManager::OutputSurface(std::ostream &out, const Phrase &phrase,
+ const std::vector<FactorType> &outputFactorOrder,
+ bool reportAllFactors) const
+{
+ UTIL_THROW_IF2(outputFactorOrder.size() == 0,
+ "Cannot be empty phrase");
+ if (reportAllFactors == true) {
+ out << phrase;
+ } else {
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+      UTIL_THROW_IF2(factor == NULL,
+                     "Empty factor 0 at position " << pos);
+      out << *factor;
+
+ for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
+ UTIL_THROW_IF2(factor == NULL,
+ "Empty factor " << i << " at position " << pos);
+
+ out << "|" << *factor;
+ }
+ out << " ";
+ }
+ }
+}
+
+// Emulates the old operator<<(ostream &, const DottedRule &) function. The
+// output format is a bit odd (reverse order and double spacing between symbols)
+// but there are scripts and tools that expect the output of -T to look like
+// that.
+void BaseManager::WriteApplicationContext(std::ostream &out,
+ const ApplicationContext &context) const
+{
+ assert(!context.empty());
+ ApplicationContext::const_reverse_iterator p = context.rbegin();
+ while (true) {
+ out << p->second << "=" << p->first << " ";
+ if (++p == context.rend()) {
+ break;
+ }
+ out << " ";
+ }
+}
+
+} // namespace
+
+
diff --git a/moses/BaseManager.h b/moses/BaseManager.h
new file mode 100644
index 000000000..f3869f6b2
--- /dev/null
+++ b/moses/BaseManager.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include "ScoreComponentCollection.h"
+#include "InputType.h"
+
+namespace Moses
+{
+class ScoreComponentCollection;
+class FeatureFunction;
+class OutputCollector;
+
+class BaseManager
+{
+protected:
+ const InputType &m_source; /**< source sentence to be translated */
+
+ BaseManager(const InputType &source)
+ :m_source(source)
+ {}
+
+ // output
+ typedef std::vector<std::pair<Moses::Word, Moses::WordsRange> > ApplicationContext;
+ typedef std::set< std::pair<size_t, size_t> > Alignments;
+
+ void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features,
+ std::ostream &out) const;
+ void OutputFeatureScores( std::ostream& out,
+ const ScoreComponentCollection &features,
+ const FeatureFunction *ff,
+ std::string &lastName ) const;
+ void OutputSurface(std::ostream &out,
+ const Phrase &phrase,
+ const std::vector<FactorType> &outputFactorOrder,
+ bool reportAllFactors) const;
+ void WriteApplicationContext(std::ostream &out,
+ const ApplicationContext &context) const;
+
+ template <class T>
+ void ShiftOffsets(std::vector<T> &offsets, T shift) const
+ {
+ T currPos = shift;
+ for (size_t i = 0; i < offsets.size(); ++i) {
+ if (offsets[i] == 0) {
+ offsets[i] = currPos;
+ ++currPos;
+ } else {
+ currPos += offsets[i];
+ }
+ }
+ }
+
+public:
+ //! the input sentence being decoded
+ const InputType& GetSource() const {
+ return m_source;
+ }
+
+ virtual void Decode() = 0;
+ // outputs
+ virtual void OutputNBest(OutputCollector *collector) const = 0;
+ virtual void OutputLatticeSamples(OutputCollector *collector) const = 0;
+ virtual void OutputAlignment(OutputCollector *collector) const = 0;
+ virtual void OutputDetailedTranslationReport(OutputCollector *collector) const = 0;
+ virtual void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const = 0;
+ virtual void OutputWordGraph(OutputCollector *collector) const = 0;
+ virtual void OutputSearchGraph(OutputCollector *collector) const = 0;
+ virtual void OutputSearchGraphSLF() const = 0;
+ virtual void OutputSearchGraphHypergraph() const = 0;
+
+
+};
+
+}
diff --git a/moses/BitmapContainer.cpp b/moses/BitmapContainer.cpp
index accd694ea..061a5953f 100644
--- a/moses/BitmapContainer.cpp
+++ b/moses/BitmapContainer.cpp
@@ -53,14 +53,30 @@ class HypothesisScoreOrdererWithDistortion
{
public:
HypothesisScoreOrdererWithDistortion(const WordsRange* transOptRange) :
- m_transOptRange(transOptRange) {}
+ m_transOptRange(transOptRange) {
+ m_totalWeightDistortion = 0;
+ const StaticData &staticData = StaticData::Instance();
+ const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
+ std::vector<FeatureFunction*>::const_iterator iter;
+ for (iter = ffs.begin(); iter != ffs.end(); ++iter) {
+ const FeatureFunction *ff = *iter;
+
+ const DistortionScoreProducer *model = dynamic_cast<const DistortionScoreProducer*>(ff);
+ if (model) {
+ float weight =staticData.GetAllWeights().GetScoreForProducer(model);
+ m_totalWeightDistortion += weight;
+ }
+ }
+
+
+ }
const WordsRange* m_transOptRange;
+ float m_totalWeightDistortion;
bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const {
UTIL_THROW_IF2(m_transOptRange == NULL, "Words range not set");
- const StaticData &staticData = StaticData::Instance();
const float distortionScoreA = DistortionScoreProducer::CalculateDistortionScore(
*hypoA,
@@ -76,20 +92,8 @@ public:
);
- float totalWeightDistortion = 0;
- const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
- std::vector<FeatureFunction*>::const_iterator iter;
- for (iter = ffs.begin(); iter != ffs.end(); ++iter) {
- const FeatureFunction *ff = *iter;
-
- const DistortionScoreProducer *model = dynamic_cast<const DistortionScoreProducer*>(ff);
- if (model) {
- float weight =staticData.GetAllWeights().GetScoreForProducer(model);
- totalWeightDistortion += weight;
- }
- }
- const float scoreA = hypoA->GetScore() + distortionScoreA * totalWeightDistortion;
- const float scoreB = hypoB->GetScore() + distortionScoreB * totalWeightDistortion;
+ const float scoreA = hypoA->GetScore() + distortionScoreA * m_totalWeightDistortion;
+ const float scoreB = hypoB->GetScore() + distortionScoreB * m_totalWeightDistortion;
if (scoreA > scoreB) {
@@ -162,12 +166,16 @@ BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
if (m_translations.size() > 1) {
UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
- "Non-monotonic future score");
+ "Non-monotonic future score: "
+ << m_translations.Get(0)->GetFutureScore() << " vs. "
+ << m_translations.Get(1)->GetFutureScore());
}
if (m_hypotheses.size() > 1) {
UTIL_THROW_IF2(m_hypotheses[0]->GetTotalScore() < m_hypotheses[1]->GetTotalScore(),
- "Non-monotonic total score");
+                   "Non-monotonic total score: "
+ << m_hypotheses[0]->GetTotalScore() << " vs. "
+ << m_hypotheses[1]->GetTotalScore());
}
HypothesisScoreOrdererWithDistortion orderer (&transOptRange);
@@ -207,7 +215,7 @@ Hypothesis *BackwardsEdge::CreateHypothesis(const Hypothesis &hypothesis, const
IFVERBOSE(2) {
hypothesis.GetManager().GetSentenceStats().StopTimeBuildHyp();
}
- newHypo->Evaluate(m_futurescore);
+ newHypo->EvaluateWhenApplied(m_futurescore);
return newHypo;
}
@@ -442,7 +450,9 @@ BitmapContainer::ProcessBestHypothesis()
if (!Empty()) {
HypothesisQueueItem *check = Dequeue(true);
UTIL_THROW_IF2(item->GetHypothesis()->GetTotalScore() < check->GetHypothesis()->GetTotalScore(),
- "Non-monotonic total score");
+ "Non-monotonic total score: "
+ << item->GetHypothesis()->GetTotalScore() << " vs. "
+ << check->GetHypothesis()->GetTotalScore());
}
// Logging for the criminally insane
diff --git a/moses/ChartCell.cpp b/moses/ChartCell.cpp
index 125efd204..1d84ba71b 100644
--- a/moses/ChartCell.cpp
+++ b/moses/ChartCell.cpp
@@ -22,6 +22,7 @@
#include <algorithm>
#include "ChartCell.h"
#include "ChartCellCollection.h"
+#include "HypergraphOutput.h"
#include "RuleCubeQueue.h"
#include "RuleCube.h"
#include "WordsRange.h"
@@ -59,7 +60,7 @@ ChartCell::~ChartCell() {}
/** Add the given hypothesis to the cell.
* Returns true if added, false if not. Maybe it already exists in the collection or score falls below threshold etc.
- * This function just calls the correspondind AddHypothesis() in ChartHypothesisCollection
+ * This function just calls the corresponding AddHypothesis() in ChartHypothesisCollection
* \param hypo Hypothesis to be added
*/
bool ChartCell::AddHypothesis(ChartHypothesis *hypo)
@@ -195,13 +196,13 @@ const HypoList *ChartCell::GetAllSortedHypotheses() const
return ret;
}
-//! call GetSearchGraph() for each hypo collection
-void ChartCell::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned, bool> &reachable) const
+//! call WriteSearchGraph() for each hypo collection
+void ChartCell::WriteSearchGraph(const ChartSearchGraphWriter& writer, const std::map<unsigned, bool> &reachable) const
{
MapType::const_iterator iterOutside;
for (iterOutside = m_hypoColl.begin(); iterOutside != m_hypoColl.end(); ++iterOutside) {
const ChartHypothesisCollection &coll = iterOutside->second;
- coll.GetSearchGraph(translationId, outputSearchGraphStream, reachable);
+ coll.WriteSearchGraph(writer, reachable);
}
}
diff --git a/moses/ChartCell.h b/moses/ChartCell.h
index 1fed695ac..99bc90866 100644
--- a/moses/ChartCell.h
+++ b/moses/ChartCell.h
@@ -40,6 +40,7 @@
namespace Moses
{
+class ChartSearchGraphWriter;
class ChartTranslationOptionList;
class ChartCellCollection;
class ChartManager;
@@ -124,7 +125,7 @@ public:
return m_coverage < compare.m_coverage;
}
- void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;
+ void WriteSearchGraph(const ChartSearchGraphWriter& writer, const std::map<unsigned,bool> &reachable) const;
};
diff --git a/moses/ChartCellCollection.h b/moses/ChartCellCollection.h
index d0423b0b2..1edeb4450 100644
--- a/moses/ChartCellCollection.h
+++ b/moses/ChartCellCollection.h
@@ -20,11 +20,11 @@
***********************************************************************/
#pragma once
+#include <boost/ptr_container/ptr_vector.hpp>
#include "InputType.h"
#include "ChartCell.h"
#include "WordsRange.h"
-
-#include <boost/ptr_container/ptr_vector.hpp>
+#include "InputPath.h"
namespace Moses
{
@@ -36,6 +36,7 @@ class ChartCellCollectionBase
public:
template <class Factory> ChartCellCollectionBase(const InputType &input, const Factory &factory) :
m_cells(input.GetSize()) {
+
size_t size = input.GetSize();
for (size_t startPos = 0; startPos < size; ++startPos) {
std::vector<ChartCellBase*> &inner = m_cells[startPos];
@@ -47,12 +48,15 @@ public:
* gets it from there :-(. The span is actually stored as a reference,
* which needs to point somewhere, so I have it refer to the ChartCell.
*/
- m_source.push_back(new ChartCellLabel(inner[0]->GetCoverage(), input.GetWord(startPos)));
+ const WordsRange &range = inner[0]->GetCoverage();
+
+ m_source.push_back(new ChartCellLabel(range, input.GetWord(startPos)));
}
}
virtual ~ChartCellCollectionBase();
+
const ChartCellBase &GetBase(const WordsRange &coverage) const {
return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
}
@@ -70,6 +74,7 @@ private:
std::vector<std::vector<ChartCellBase*> > m_cells;
boost::ptr_vector<ChartCellLabel> m_source;
+
};
/** Hold all the chart cells for 1 input sentence. A variable of this type is held by the ChartManager
diff --git a/moses/ChartCellLabel.h b/moses/ChartCellLabel.h
index 144a64add..c67d985b2 100644
--- a/moses/ChartCellLabel.h
+++ b/moses/ChartCellLabel.h
@@ -90,6 +90,7 @@ public:
private:
const WordsRange &m_coverage;
const Word &m_label;
+ //const InputPath &m_inputPath;
Stack m_stack;
mutable float m_bestScore;
};
diff --git a/moses/ChartCellLabelSet.h b/moses/ChartCellLabelSet.h
index 2c1e8b50f..591aa17a6 100644
--- a/moses/ChartCellLabelSet.h
+++ b/moses/ChartCellLabelSet.h
@@ -72,6 +72,8 @@ public:
size_t idx = w[0]->GetId();
if (! ChartCellExists(idx)) {
m_size++;
+
+
m_map[idx] = new ChartCellLabel(m_coverage, w);
}
}
@@ -119,6 +121,15 @@ public:
}
}
+ const ChartCellLabel *Find(size_t idx) const {
+ try {
+ return m_map.at(idx);
+ }
+ catch (const std::out_of_range& oor) {
+ return NULL;
+ }
+ }
+
ChartCellLabel::Stack &FindOrInsert(const Word &w) {
size_t idx = w[0]->GetId();
if (! ChartCellExists(idx)) {
diff --git a/moses/ChartHypothesis.cpp b/moses/ChartHypothesis.cpp
index 83d95aee9..5227d91d2 100644
--- a/moses/ChartHypothesis.cpp
+++ b/moses/ChartHypothesis.cpp
@@ -149,6 +149,40 @@ Phrase ChartHypothesis::GetOutputPhrase() const
return outPhrase;
}
+void ChartHypothesis::GetOutputPhrase(int leftRightMost, int numWords, Phrase &outPhrase) const
+{
+ const TargetPhrase &tp = GetCurrTargetPhrase();
+
+ int targetSize = tp.GetSize();
+ for (int i = 0; i < targetSize; ++i) {
+ int pos;
+ if (leftRightMost == 1) {
+ pos = i;
+ }
+ else if (leftRightMost == 2) {
+ pos = targetSize - i - 1;
+ }
+ else {
+ abort();
+ }
+
+ const Word &word = tp.GetWord(pos);
+
+ if (word.IsNonTerminal()) {
+ // non-term. fill out with prev hypo
+ size_t nonTermInd = tp.GetAlignNonTerm().GetNonTermIndexMap()[pos];
+ const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd];
+ prevHypo->GetOutputPhrase(outPhrase);
+ } else {
+ outPhrase.AddWord(word);
+ }
+
+ if (outPhrase.GetSize() >= numWords) {
+ return;
+ }
+ }
+}
+
/** check, if two hypothesis can be recombined.
this is actually a sorting function that allows us to
keep an ordered list of hypotheses. This makes recombination
@@ -178,7 +212,7 @@ int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
/** calculate total score
* @todo this should be in ScoreBreakdown
*/
-void ChartHypothesis::Evaluate()
+void ChartHypothesis::EvaluateWhenApplied()
{
const StaticData &staticData = StaticData::Instance();
// total scores from prev hypos
@@ -200,7 +234,7 @@ void ChartHypothesis::Evaluate()
StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (unsigned i = 0; i < sfs.size(); ++i) {
if (! staticData.IsFeatureFunctionIgnored( *sfs[i] )) {
- sfs[i]->EvaluateChart(*this,&m_scoreBreakdown);
+ sfs[i]->EvaluateWhenApplied(*this,&m_scoreBreakdown);
}
}
@@ -208,7 +242,7 @@ void ChartHypothesis::Evaluate()
StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i) {
if (! staticData.IsFeatureFunctionIgnored( *ffs[i] )) {
- m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
+ m_ffStates[i] = ffs[i]->EvaluateWhenApplied(*this,i,&m_scoreBreakdown);
}
}
@@ -259,7 +293,7 @@ void ChartHypothesis::CleanupArcList()
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
- bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph();
+ bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphHypergraph();
if (!distinctNBest && m_arcList->size() > nBestSize) {
// prune arc list only if there too many arcs
diff --git a/moses/ChartHypothesis.h b/moses/ChartHypothesis.h
index 12050e764..8dc26e721 100644
--- a/moses/ChartHypothesis.h
+++ b/moses/ChartHypothesis.h
@@ -138,9 +138,13 @@ public:
void GetOutputPhrase(Phrase &outPhrase) const;
Phrase GetOutputPhrase() const;
+ // get leftmost/rightmost words only
+ // leftRightMost: 1=left, 2=right
+ void GetOutputPhrase(int leftRightMost, int numWords, Phrase &outPhrase) const;
+
int RecombineCompare(const ChartHypothesis &compare) const;
- void Evaluate();
+ void EvaluateWhenApplied();
void AddArc(ChartHypothesis *loserHypo);
void CleanupArcList();
diff --git a/moses/ChartHypothesisCollection.cpp b/moses/ChartHypothesisCollection.cpp
index ee155b103..d53211f34 100644
--- a/moses/ChartHypothesisCollection.cpp
+++ b/moses/ChartHypothesisCollection.cpp
@@ -24,6 +24,7 @@
#include "ChartHypothesisCollection.h"
#include "ChartHypothesis.h"
#include "ChartManager.h"
+#include "HypergraphOutput.h"
#include "util/exception.hh"
using namespace std;
@@ -55,7 +56,7 @@ ChartHypothesisCollection::~ChartHypothesisCollection()
/** public function to add hypothesis to this collection.
* Returns false if equiv hypo exists in collection, otherwise returns true.
* Takes care of update arc list for n-best list creation.
- * Will delete hypo is it exist - once this function is call don't delete hypothesis.
+ * Will delete hypo if it exists - once this function is called, don't delete the hypothesis.
* \param hypo hypothesis to add
* \param manager pointer back to manager
*/
@@ -293,27 +294,9 @@ void ChartHypothesisCollection::CleanupArcList()
* \param outputSearchGraphStream stream to output the info to
* \param reachable @todo don't know
*/
-void ChartHypothesisCollection::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned, bool> &reachable) const
+void ChartHypothesisCollection::WriteSearchGraph(const ChartSearchGraphWriter& writer, const std::map<unsigned, bool> &reachable) const
{
- HCType::const_iterator iter;
- for (iter = m_hypos.begin() ; iter != m_hypos.end() ; ++iter) {
- ChartHypothesis &mainHypo = **iter;
- if (StaticData::Instance().GetUnprunedSearchGraph() ||
- reachable.find(mainHypo.GetId()) != reachable.end()) {
- outputSearchGraphStream << translationId << " " << mainHypo << endl;
- }
-
- const ChartArcList *arcList = mainHypo.GetArcList();
- if (arcList) {
- ChartArcList::const_iterator iterArc;
- for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
- const ChartHypothesis &arc = **iterArc;
- if (reachable.find(arc.GetId()) != reachable.end()) {
- outputSearchGraphStream << translationId << " " << arc << endl;
- }
- }
- }
- }
+ writer.WriteHypos(*this,reachable);
}
std::ostream& operator<<(std::ostream &out, const ChartHypothesisCollection &coll)
diff --git a/moses/ChartHypothesisCollection.h b/moses/ChartHypothesisCollection.h
index 438c2dd33..37cd907d9 100644
--- a/moses/ChartHypothesisCollection.h
+++ b/moses/ChartHypothesisCollection.h
@@ -28,6 +28,8 @@
namespace Moses
{
+class ChartSearchGraphWriter;
+
//! functor to compare (chart) hypotheses by (descending) score
class ChartHypothesisScoreOrderer
{
@@ -117,7 +119,7 @@ public:
return m_bestScore;
}
- void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;
+ void WriteSearchGraph(const ChartSearchGraphWriter& writer, const std::map<unsigned,bool> &reachable) const;
};
diff --git a/moses/ChartKBestExtractor.cpp b/moses/ChartKBestExtractor.cpp
index 8de77f139..cb1097c8a 100644
--- a/moses/ChartKBestExtractor.cpp
+++ b/moses/ChartKBestExtractor.cpp
@@ -124,6 +124,35 @@ Phrase ChartKBestExtractor::GetOutputPhrase(const Derivation &d)
return ret;
}
+// Generate the target tree of the derivation d.
+TreePointer ChartKBestExtractor::GetOutputTree(const Derivation &d)
+{
+ const ChartHypothesis &hypo = d.edge.head->hypothesis;
+ const TargetPhrase &phrase = hypo.GetCurrTargetPhrase();
+ if (const PhraseProperty *property = phrase.GetProperty("Tree")) {
+ const std::string *tree = property->GetValueString();
+ TreePointer mytree (boost::make_shared<InternalTree>(*tree));
+
+ //get subtrees (in target order)
+ std::vector<TreePointer> previous_trees;
+ for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
+ const Word &word = phrase.GetWord(pos);
+ if (word.IsNonTerminal()) {
+ size_t nonTermInd = phrase.GetAlignNonTerm().GetNonTermIndexMap()[pos];
+ const Derivation &subderivation = *d.subderivations[nonTermInd];
+ const TreePointer prev_tree = GetOutputTree(subderivation);
+ previous_trees.push_back(prev_tree);
+ }
+ }
+
+ mytree->Combine(previous_trees);
+ return mytree;
+ }
+ else {
+ UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
+ }
+}
+
// Create an unweighted hyperarc corresponding to the given ChartHypothesis.
ChartKBestExtractor::UnweightedHyperarc ChartKBestExtractor::CreateEdge(
const ChartHypothesis &h)
diff --git a/moses/ChartKBestExtractor.h b/moses/ChartKBestExtractor.h
index 416d425b2..e28150454 100644
--- a/moses/ChartKBestExtractor.h
+++ b/moses/ChartKBestExtractor.h
@@ -22,6 +22,7 @@
#include <cassert>
#include "ChartHypothesis.h"
#include "ScoreComponentCollection.h"
+#include "FF/InternalTree.h"
#include <boost/unordered_set.hpp>
#include <boost/weak_ptr.hpp>
@@ -89,6 +90,7 @@ public:
std::size_t k, KBestVec &);
static Phrase GetOutputPhrase(const Derivation &);
+ static TreePointer GetOutputTree(const Derivation &);
private:
typedef boost::unordered_map<const ChartHypothesis *,
diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp
index 51145d31d..44f3ab75f 100644
--- a/moses/ChartManager.cpp
+++ b/moses/ChartManager.cpp
@@ -25,13 +25,17 @@
#include "ChartHypothesis.h"
#include "ChartKBestExtractor.h"
#include "ChartTranslationOptions.h"
+#include "HypergraphOutput.h"
#include "StaticData.h"
#include "DecodeStep.h"
#include "TreeInput.h"
+#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/WordPenaltyProducer.h"
+#include "moses/OutputCollector.h"
+#include "moses/ChartKBestExtractor.h"
+#include "moses/HypergraphOutput.h"
using namespace std;
-using namespace Moses;
namespace Moses
{
@@ -42,7 +46,7 @@ extern bool g_mosesDebug;
* \param system which particular set of models to use.
*/
ChartManager::ChartManager(InputType const& source)
- :m_source(source)
+ :BaseManager(source)
,m_hypoStackColl(source, *this)
,m_start(clock())
,m_hypothesisId(0)
@@ -61,7 +65,7 @@ ChartManager::~ChartManager()
}
//! decode the sentence. This contains the main laps. Basically, the CKY++ algorithm
-void ChartManager::ProcessSentence()
+void ChartManager::Decode()
{
VERBOSE(1,"Translating: " << m_source << endl);
@@ -85,7 +89,7 @@ void ChartManager::ProcessSentence()
m_translationOptionList.ApplyThreshold();
const InputPath &inputPath = m_parser.GetInputPath(range);
- m_translationOptionList.Evaluate(m_source, inputPath);
+ m_translationOptionList.EvaluateWithSourceContext(m_source, inputPath);
// decode
ChartCell &cell = m_hypoStackColl.Get(range);
@@ -125,7 +129,7 @@ void ChartManager::ProcessSentence()
*/
void ChartManager::AddXmlChartOptions()
{
- const StaticData &staticData = StaticData::Instance();
+ // const StaticData &staticData = StaticData::Instance();
const std::vector <ChartTranslationOptions*> xmlChartOptionsList = m_source.GetXmlChartTranslationOptions();
IFVERBOSE(2) {
@@ -141,7 +145,7 @@ void ChartManager::AddXmlChartOptions()
RuleCubeItem* item = new RuleCubeItem( *opt, m_hypoStackColl );
ChartHypothesis* hypo = new ChartHypothesis(*opt, *item, *this);
- hypo->Evaluate();
+ hypo->EvaluateWhenApplied();
ChartCell &cell = m_hypoStackColl.Get(range);
@@ -222,8 +226,9 @@ void ChartManager::CalcNBest(
}
}
-void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const
+void ChartManager::WriteSearchGraph(const ChartSearchGraphWriter& writer) const
{
+
size_t size = m_source.GetSize();
// which hypotheses are reachable?
@@ -236,7 +241,11 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
// no hypothesis
return;
}
- FindReachableHypotheses( hypo, reachable);
+ size_t winners = 0;
+ size_t losers = 0;
+
+ FindReachableHypotheses( hypo, reachable, &winners, &losers);
+ writer.WriteHeader(winners, losers);
for (size_t width = 1; width <= size; ++width) {
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
@@ -245,12 +254,13 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
TRACE_ERR(" " << range << "=");
const ChartCell &cell = m_hypoStackColl.Get(range);
- cell.GetSearchGraph(translationId, outputSearchGraphStream, reachable);
+ cell.WriteSearchGraph(writer, reachable);
}
}
}
-void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const
+void ChartManager::FindReachableHypotheses(
+ const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable, size_t* winners, size_t* losers) const
{
// do not recurse, if already visited
if (reachable.find(hypo->GetId()) != reachable.end()) {
@@ -259,9 +269,14 @@ void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::ma
// recurse
reachable[ hypo->GetId() ] = true;
+ if (hypo->GetWinningHypothesis() == hypo) {
+ (*winners)++;
+ } else {
+ (*losers)++;
+ }
const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i) {
- FindReachableHypotheses( *i, reachable );
+ FindReachableHypotheses( *i, reachable, winners, losers );
}
// also loop over recombined hypotheses (arcs)
@@ -270,9 +285,526 @@ void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::ma
ChartArcList::const_iterator iterArc;
for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
const ChartHypothesis &arc = **iterArc;
- FindReachableHypotheses( &arc, reachable );
+ FindReachableHypotheses( &arc, reachable, winners, losers );
+ }
+ }
+}
+
+void ChartManager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const {
+ ChartSearchGraphWriterHypergraph writer(&outputSearchGraphStream);
+ WriteSearchGraph(writer);
+}
+
+void ChartManager::OutputSearchGraphMoses(std::ostream &outputSearchGraphStream) const {
+ ChartSearchGraphWriterMoses writer(&outputSearchGraphStream, m_source.GetTranslationId());
+ WriteSearchGraph(writer);
+}
+
+void ChartManager::OutputNBest(OutputCollector *collector) const
+{
+ const StaticData &staticData = StaticData::Instance();
+ size_t nBestSize = staticData.GetNBestSize();
+ if (nBestSize > 0) {
+ const size_t translationId = m_source.GetTranslationId();
+
+ VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl);
+ std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
+ CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest());
+ OutputNBestList(collector, nBestList, translationId);
+ IFVERBOSE(2) {
+ PrintUserTime("N-Best Hypotheses Generation Time:");
+ }
+ }
+
+}
+
+void ChartManager::OutputNBestList(OutputCollector *collector,
+ const ChartKBestExtractor::KBestVec &nBestList,
+ long translationId) const
+{
+ const StaticData &staticData = StaticData::Instance();
+ const std::vector<Moses::FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder();
+
+ std::ostringstream out;
+
+ if (collector->OutputIsCout()) {
+ // Set precision only if we're writing the n-best list to cout. This is to
+ // preserve existing behaviour, but should probably be done either way.
+ FixPrecision(out);
+ }
+
+ bool includeWordAlignment =
+ StaticData::Instance().PrintAlignmentInfoInNbest();
+
+ bool PrintNBestTrees = StaticData::Instance().PrintNBestTrees();
+
+ for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
+ p != nBestList.end(); ++p) {
+ const ChartKBestExtractor::Derivation &derivation = **p;
+
+ // get the derivation's target-side yield
+ Phrase outputPhrase = ChartKBestExtractor::GetOutputPhrase(derivation);
+
+ // delete <s> and </s>
+ UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+ outputPhrase.RemoveWord(0);
+ outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
+
+ // print the translation ID, surface factors, and scores
+ out << translationId << " ||| ";
+ OutputSurface(out, outputPhrase, outputFactorOrder, false);
+ out << " ||| ";
+ OutputAllFeatureScores(derivation.scoreBreakdown, out);
+ out << " ||| " << derivation.score;
+
+ // optionally, print word alignments
+ if (includeWordAlignment) {
+ out << " ||| ";
+ Alignments align;
+ OutputAlignmentNBest(align, derivation, 0);
+ for (Alignments::const_iterator q = align.begin(); q != align.end();
+ ++q) {
+ out << q->first << "-" << q->second << " ";
+ }
+ }
+
+ // optionally, print tree
+ if (PrintNBestTrees) {
+ TreePointer tree = ChartKBestExtractor::GetOutputTree(derivation);
+ out << " ||| " << tree->GetString();
+ }
+
+ out << std::endl;
+ }
+
+ assert(collector);
+ collector->Write(translationId, out.str());
+}
+
+size_t ChartManager::CalcSourceSize(const Moses::ChartHypothesis *hypo) const
+{
+ size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+ for (size_t i = 0; i < prevHypos.size(); ++i) {
+ size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
+ ret -= (childSize - 1);
+ }
+ return ret;
+}
+
+size_t ChartManager::OutputAlignmentNBest(
+ Alignments &retAlign,
+ const Moses::ChartKBestExtractor::Derivation &derivation,
+ size_t startTarget) const
+{
+ const ChartHypothesis &hypo = derivation.edge.head->hypothesis;
+
+ size_t totalTargetSize = 0;
+ size_t startSource = hypo.GetCurrSourceRange().GetStartPos();
+
+ const TargetPhrase &tp = hypo.GetCurrTargetPhrase();
+
+ size_t thisSourceSize = CalcSourceSize(&hypo);
+
+ // position of each terminal word in translation rule, irrespective of alignment
+ // if non-term, number is undefined
+ vector<size_t> sourceOffsets(thisSourceSize, 0);
+ vector<size_t> targetOffsets(tp.GetSize(), 0);
+
+ const AlignmentInfo &aiNonTerm = hypo.GetCurrTargetPhrase().GetAlignNonTerm();
+ vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
+ const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
+
+ UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
+ "Error");
+
+ size_t targetInd = 0;
+ for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
+ if (tp.GetWord(targetPos).IsNonTerminal()) {
+ UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
+ size_t sourceInd = targetPos2SourceInd[targetPos];
+ size_t sourcePos = sourceInd2pos[sourceInd];
+
+ const Moses::ChartKBestExtractor::Derivation &subderivation =
+ *derivation.subderivations[sourceInd];
+
+ // calc source size
+ size_t sourceSize = subderivation.edge.head->hypothesis.GetCurrSourceRange().GetNumWordsCovered();
+ sourceOffsets[sourcePos] = sourceSize;
+
+ // calc target size.
+ // Recursively look thru child hypos
+ size_t currStartTarget = startTarget + totalTargetSize;
+ size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
+ currStartTarget);
+ targetOffsets[targetPos] = targetSize;
+
+ totalTargetSize += targetSize;
+ ++targetInd;
+ } else {
+ ++totalTargetSize;
+ }
+ }
+
+ // convert position within translation rule to absolute position within
+ // source sentence / output sentence
+ ShiftOffsets(sourceOffsets, startSource);
+ ShiftOffsets(targetOffsets, startTarget);
+
+ // get alignments from this hypo
+ const AlignmentInfo &aiTerm = hypo.GetCurrTargetPhrase().GetAlignTerm();
+
+ // add to output arg, offsetting by source & target
+ AlignmentInfo::const_iterator iter;
+ for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+ const std::pair<size_t,size_t> &align = *iter;
+ size_t relSource = align.first;
+ size_t relTarget = align.second;
+ size_t absSource = sourceOffsets[relSource];
+ size_t absTarget = targetOffsets[relTarget];
+
+ pair<size_t, size_t> alignPoint(absSource, absTarget);
+ pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+ UTIL_THROW_IF2(!ret.second, "Error");
+ }
+
+ return totalTargetSize;
+}
+
+void ChartManager::OutputAlignment(OutputCollector *collector) const
+{
+ if (collector == NULL) {
+ return;
+ }
+
+ ostringstream out;
+
+ const ChartHypothesis *hypo = GetBestHypothesis();
+ if (hypo) {
+ Alignments retAlign;
+ OutputAlignment(retAlign, hypo, 0);
+
+ // output alignments
+ Alignments::const_iterator iter;
+ for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
+ const pair<size_t, size_t> &alignPoint = *iter;
+ out << alignPoint.first << "-" << alignPoint.second << " ";
+ }
+ }
+ out << endl;
+
+ collector->Write(m_source.GetTranslationId(), out.str());
+
+}
+
+size_t ChartManager::OutputAlignment(Alignments &retAlign,
+ const Moses::ChartHypothesis *hypo,
+ size_t startTarget) const
+{
+ size_t totalTargetSize = 0;
+ size_t startSource = hypo->GetCurrSourceRange().GetStartPos();
+
+ const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
+
+ size_t thisSourceSize = CalcSourceSize(hypo);
+
+ // position of each terminal word in translation rule, irrespective of alignment
+ // if non-term, number is undefined
+ vector<size_t> sourceOffsets(thisSourceSize, 0);
+ vector<size_t> targetOffsets(tp.GetSize(), 0);
+
+ const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+
+ const AlignmentInfo &aiNonTerm = hypo->GetCurrTargetPhrase().GetAlignNonTerm();
+ vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
+ const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
+
+ UTIL_THROW_IF2(sourceInd2pos.size() != prevHypos.size(), "Error");
+
+ size_t targetInd = 0;
+ for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
+ if (tp.GetWord(targetPos).IsNonTerminal()) {
+ UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
+ size_t sourceInd = targetPos2SourceInd[targetPos];
+ size_t sourcePos = sourceInd2pos[sourceInd];
+
+ const ChartHypothesis *prevHypo = prevHypos[sourceInd];
+
+ // calc source size
+ size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
+ sourceOffsets[sourcePos] = sourceSize;
+
+ // calc target size.
+ // Recursively look thru child hypos
+ size_t currStartTarget = startTarget + totalTargetSize;
+ size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
+ targetOffsets[targetPos] = targetSize;
+
+ totalTargetSize += targetSize;
+ ++targetInd;
+ } else {
+ ++totalTargetSize;
}
}
+
+ // convert position within translation rule to absolute position within
+ // source sentence / output sentence
+ ShiftOffsets(sourceOffsets, startSource);
+ ShiftOffsets(targetOffsets, startTarget);
+
+ // get alignments from this hypo
+ const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
+
+ // add to output arg, offsetting by source & target
+ AlignmentInfo::const_iterator iter;
+ for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+ const std::pair<size_t,size_t> &align = *iter;
+ size_t relSource = align.first;
+ size_t relTarget = align.second;
+ size_t absSource = sourceOffsets[relSource];
+ size_t absTarget = targetOffsets[relTarget];
+
+ pair<size_t, size_t> alignPoint(absSource, absTarget);
+ pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+ UTIL_THROW_IF2(!ret.second, "Error");
+
+ }
+
+ return totalTargetSize;
+}
+
+void ChartManager::OutputDetailedTranslationReport(OutputCollector *collector) const
+{
+ if (collector) {
+ OutputDetailedTranslationReport(collector,
+ GetBestHypothesis(),
+ static_cast<const Sentence&>(m_source),
+ m_source.GetTranslationId());
+ }
+}
+
+void ChartManager::OutputDetailedTranslationReport(
+ OutputCollector *collector,
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId) const
+{
+ if (hypo == NULL) {
+ return;
+ }
+ std::ostringstream out;
+ ApplicationContext applicationContext;
+
+ OutputTranslationOptions(out, applicationContext, hypo, sentence, translationId);
+ collector->Write(translationId, out.str());
+
+ //DIMw
+ const StaticData &staticData = StaticData::Instance();
+
+ if (staticData.IsDetailedAllTranslationReportingEnabled()) {
+ const Sentence &sentence = dynamic_cast<const Sentence &>(m_source);
+ size_t nBestSize = staticData.GetNBestSize();
+ std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
+ CalcNBest(nBestSize, nBestList, staticData.GetDistinctNBest());
+ OutputDetailedAllTranslationReport(collector, nBestList, sentence, translationId);
+ }
+
+}
+
+void ChartManager::OutputTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId) const
+{
+ if (hypo != NULL) {
+ OutputTranslationOption(out, applicationContext, hypo, sentence, translationId);
+ out << std::endl;
+ }
+
+ // recursive
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+ std::vector<const ChartHypothesis*>::const_iterator iter;
+ for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
+ const ChartHypothesis *prevHypo = *iter;
+ OutputTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
+ }
+}
+
+void ChartManager::OutputTranslationOption(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId) const
+{
+ ReconstructApplicationContext(*hypo, sentence, applicationContext);
+ out << "Trans Opt " << translationId
+ << " " << hypo->GetCurrSourceRange()
+ << ": ";
+ WriteApplicationContext(out, applicationContext);
+ out << ": " << hypo->GetCurrTargetPhrase().GetTargetLHS()
+ << "->" << hypo->GetCurrTargetPhrase()
+ << " " << hypo->GetTotalScore() << hypo->GetScoreBreakdown();
+}
+
+// Given a hypothesis and sentence, reconstructs the 'application context' --
+// the source RHS symbols of the SCFG rule that was applied, plus their spans.
+void ChartManager::ReconstructApplicationContext(const ChartHypothesis &hypo,
+ const Sentence &sentence,
+ ApplicationContext &context) const
+{
+ context.clear();
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo.GetPrevHypos();
+ std::vector<const ChartHypothesis*>::const_iterator p = prevHypos.begin();
+ std::vector<const ChartHypothesis*>::const_iterator end = prevHypos.end();
+ const WordsRange &span = hypo.GetCurrSourceRange();
+ size_t i = span.GetStartPos();
+ while (i <= span.GetEndPos()) {
+ if (p == end || i < (*p)->GetCurrSourceRange().GetStartPos()) {
+ // Symbol is a terminal.
+ const Word &symbol = sentence.GetWord(i);
+ context.push_back(std::make_pair(symbol, WordsRange(i, i)));
+ ++i;
+ } else {
+ // Symbol is a non-terminal.
+ const Word &symbol = (*p)->GetTargetLHS();
+ const WordsRange &range = (*p)->GetCurrSourceRange();
+ context.push_back(std::make_pair(symbol, range));
+ i = range.GetEndPos()+1;
+ ++p;
+ }
+ }
+}
+
+void ChartManager::OutputUnknowns(OutputCollector *collector) const
+{
+ if (collector) {
+ long translationId = m_source.GetTranslationId();
+ const std::vector<Phrase*> &oovs = GetParser().GetUnknownSources();
+
+ std::ostringstream out;
+ for (std::vector<Phrase*>::const_iterator p = oovs.begin();
+ p != oovs.end(); ++p) {
+ out << *p;
+ }
+ out << std::endl;
+ collector->Write(translationId, out.str());
+ }
+
+}
+
+void ChartManager::OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const
+{
+ const ChartHypothesis *hypo = GetBestHypothesis();
+ if (collector == NULL || hypo == NULL) {
+ return;
+ }
+
+ std::ostringstream out;
+ ApplicationContext applicationContext;
+
+ const Sentence &sentence = dynamic_cast<const Sentence &>(m_source);
+ const size_t translationId = m_source.GetTranslationId();
+
+ OutputTreeFragmentsTranslationOptions(out, applicationContext, hypo, sentence, translationId);
+
+ //Tree of full sentence
+ const StatefulFeatureFunction* treeStructure = StaticData::Instance().GetTreeStructure();
+ if (treeStructure != NULL) {
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for( size_t i=0; i<sff.size(); i++ ) {
+ if (sff[i] == treeStructure) {
+ const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
+ out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
+ break;
+ }
+ }
+ }
+
+ collector->Write(translationId, out.str());
+
+}
+
+void ChartManager::OutputTreeFragmentsTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId) const
+{
+
+ if (hypo != NULL) {
+ OutputTranslationOption(out, applicationContext, hypo, sentence, translationId);
+
+ const TargetPhrase &currTarPhr = hypo->GetCurrTargetPhrase();
+
+ out << " ||| ";
+ if (const PhraseProperty *property = currTarPhr.GetProperty("Tree")) {
+ out << " " << *property->GetValueString();
+ } else {
+ out << " " << "noTreeInfo";
+ }
+ out << std::endl;
+ }
+
+ // recursive
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+ std::vector<const ChartHypothesis*>::const_iterator iter;
+ for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
+ const ChartHypothesis *prevHypo = *iter;
+ OutputTreeFragmentsTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
+ }
+}
+
+void ChartManager::OutputSearchGraph(OutputCollector *collector) const
+{
+ if (collector) {
+ long translationId = m_source.GetTranslationId();
+ std::ostringstream out;
+ OutputSearchGraphMoses( out);
+ collector->Write(translationId, out.str());
+ }
+}
+
+//DIMw
+void ChartManager::OutputDetailedAllTranslationReport(
+ OutputCollector *collector,
+ const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList,
+ const Sentence &sentence,
+ long translationId) const
+{
+ std::ostringstream out;
+ ApplicationContext applicationContext;
+
+ const ChartCellCollection& cells = GetChartCellCollection();
+ size_t size = GetSource().GetSize();
+ for (size_t width = 1; width <= size; ++width) {
+ for (size_t startPos = 0; startPos <= size-width; ++startPos) {
+ size_t endPos = startPos + width - 1;
+ WordsRange range(startPos, endPos);
+ const ChartCell& cell = cells.Get(range);
+ const HypoList* hyps = cell.GetAllSortedHypotheses();
+ out << "Chart Cell [" << startPos << ".." << endPos << "]" << endl;
+ HypoList::const_iterator iter;
+ size_t c = 1;
+ for (iter = hyps->begin(); iter != hyps->end(); ++iter) {
+ out << "----------------Item " << c++ << " ---------------------"
+ << endl;
+ OutputTranslationOptions(out, applicationContext, *iter,
+ sentence, translationId);
+ }
+ }
+ }
+ collector->Write(translationId, out.str());
+}
+
+void ChartManager::OutputSearchGraphHypergraph() const
+{
+ const StaticData &staticData = StaticData::Instance();
+ if (staticData.GetOutputSearchGraphHypergraph()) {
+ HypergraphOutput<ChartManager> hypergraphOutputChart(PRECISION);
+ hypergraphOutputChart.Write(*this);
+ }
}
} // namespace Moses
diff --git a/moses/ChartManager.h b/moses/ChartManager.h
index 830138b08..a4f27750e 100644
--- a/moses/ChartManager.h
+++ b/moses/ChartManager.h
@@ -25,12 +25,13 @@
#include <boost/unordered_map.hpp>
#include "ChartCell.h"
#include "ChartCellCollection.h"
-#include "InputType.h"
#include "WordsRange.h"
#include "SentenceStats.h"
#include "ChartTranslationOptionList.h"
#include "ChartParser.h"
#include "ChartKBestExtractor.h"
+#include "BaseManager.h"
+#include "moses/Syntax/KBestExtractor.h"
#include <boost/shared_ptr.hpp>
@@ -38,13 +39,13 @@ namespace Moses
{
class ChartHypothesis;
+class ChartSearchGraphWriter;
/** Holds everything you need to decode 1 sentence with the hierachical/syntax decoder
*/
-class ChartManager
+class ChartManager : public BaseManager
{
private:
- InputType const& m_source; /**< source sentence to be translated */
ChartCellCollection m_hypoStackColl;
std::auto_ptr<SentenceStats> m_sentenceStats;
clock_t m_start; /**< starting time, used for logging */
@@ -54,21 +55,64 @@ private:
ChartTranslationOptionList m_translationOptionList; /**< pre-computed list of translation options for the phrases in this sentence */
+ /* auxilliary functions for SearchGraphs */
+ void FindReachableHypotheses(
+ const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable , size_t* winners, size_t* losers) const;
+ void WriteSearchGraph(const ChartSearchGraphWriter& writer) const;
+
+ // output
+ void OutputNBestList(OutputCollector *collector,
+ const ChartKBestExtractor::KBestVec &nBestList,
+ long translationId) const;
+ size_t CalcSourceSize(const Moses::ChartHypothesis *hypo) const;
+ size_t OutputAlignmentNBest(Alignments &retAlign,
+ const Moses::ChartKBestExtractor::Derivation &derivation,
+ size_t startTarget) const;
+ size_t OutputAlignment(Alignments &retAlign,
+ const Moses::ChartHypothesis *hypo,
+ size_t startTarget) const;
+ void OutputDetailedTranslationReport(
+ OutputCollector *collector,
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId) const;
+ void OutputTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId) const;
+ void OutputTranslationOption(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId) const;
+ void ReconstructApplicationContext(const ChartHypothesis &hypo,
+ const Sentence &sentence,
+ ApplicationContext &context) const;
+ void OutputTreeFragmentsTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const ChartHypothesis *hypo,
+ const Sentence &sentence,
+ long translationId) const;
+ void OutputDetailedAllTranslationReport(
+ OutputCollector *collector,
+ const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList,
+ const Sentence &sentence,
+ long translationId) const;
+
public:
ChartManager(InputType const& source);
~ChartManager();
- void ProcessSentence();
+ void Decode();
void AddXmlChartOptions();
const ChartHypothesis *GetBestHypothesis() const;
void CalcNBest(size_t n, std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList, bool onlyDistinct=false) const;
- void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
- void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxilliary function for GetSearchGraph */
+ /** "Moses" (osg) type format */
+ void OutputSearchGraphMoses(std::ostream &outputSearchGraphStream) const;
- //! the input sentence being decoded
- const InputType& GetSource() const {
- return m_source;
- }
+ /** Output in (modified) Kenneth hypergraph format */
+ void OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const;
//! debug data collected when decoding sentence
SentenceStats& GetSentenceStats() const {
@@ -96,9 +140,23 @@ public:
return m_hypothesisId++;
}
- const ChartParser &GetParser() const {
- return m_parser;
- }
+ const ChartParser &GetParser() const { return m_parser; }
+
+ // outputs
+ void OutputNBest(OutputCollector *collector) const;
+ void OutputLatticeSamples(OutputCollector *collector) const
+ {}
+ void OutputAlignment(OutputCollector *collector) const;
+ void OutputDetailedTranslationReport(OutputCollector *collector) const;
+ void OutputUnknowns(OutputCollector *collector) const;
+ void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const;
+ void OutputWordGraph(OutputCollector *collector) const
+ {}
+ void OutputSearchGraph(OutputCollector *collector) const;
+ void OutputSearchGraphSLF() const
+ {}
+ void OutputSearchGraphHypergraph() const;
+
};
}
diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp
index 36bc9476c..a9a1e6aa1 100644
--- a/moses/ChartParser.cpp
+++ b/moses/ChartParser.cpp
@@ -68,6 +68,12 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
m_unksrcs.push_back(unksrc);
+ // hack. Once the OOV FF is a phrase table, get rid of this
+ PhraseDictionary *firstPt = NULL;
+ if (PhraseDictionary::GetColl().size() == 0) {
+ firstPt = PhraseDictionary::GetColl()[0];
+ }
+
//TranslationOption *transOpt;
if (! staticData.GetDropUnknown() || isDigit) {
// loop
@@ -85,7 +91,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");
// add to dictionary
- TargetPhrase *targetPhrase = new TargetPhrase();
+ TargetPhrase *targetPhrase = new TargetPhrase(firstPt);
Word &targetWord = targetPhrase->AddWord();
targetWord.CreateUnknownWord(sourceWord);
@@ -93,11 +99,10 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
float unknownScore = FloorScore(TransformScore(prob));
targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
- targetPhrase->Evaluate(*unksrc);
-
+ targetPhrase->EvaluateInIsolation(*unksrc);
targetPhrase->SetTargetLHS(targetLHS);
targetPhrase->SetAlignmentInfo("0-0");
- if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.GetTreeStructure() != NULL) {
+ if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.PrintNBestTrees() || staticData.GetTreeStructure() != NULL) {
targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
}
@@ -108,7 +113,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
// drop source word. create blank trans opt
float unknownScore = FloorScore(-numeric_limits<float>::infinity());
- TargetPhrase *targetPhrase = new TargetPhrase();
+ TargetPhrase *targetPhrase = new TargetPhrase(firstPt);
// loop
const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
UnknownLHSList::const_iterator iterLHS;
@@ -121,7 +126,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");
targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
- targetPhrase->Evaluate(*unksrc);
+ targetPhrase->EvaluateInIsolation(*unksrc);
targetPhrase->SetTargetLHS(targetLHS);
diff --git a/moses/ChartParserCallback.h b/moses/ChartParserCallback.h
index ce4af3ab4..9b03e1f5b 100644
--- a/moses/ChartParserCallback.h
+++ b/moses/ChartParserCallback.h
@@ -25,7 +25,7 @@ public:
virtual void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) = 0;
- virtual void Evaluate(const InputType &input, const InputPath &inputPath) = 0;
+ virtual void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath) = 0;
virtual float GetBestScore(const ChartCellLabel *chartCell) const = 0;
diff --git a/moses/ChartTranslationOption.cpp b/moses/ChartTranslationOption.cpp
index f10097b2e..332b26a15 100644
--- a/moses/ChartTranslationOption.cpp
+++ b/moses/ChartTranslationOption.cpp
@@ -10,15 +10,15 @@ ChartTranslationOption::ChartTranslationOption(const TargetPhrase &targetPhrase)
{
}
-void ChartTranslationOption::Evaluate(const InputType &input,
- const InputPath &inputPath,
- const StackVec &stackVec)
+void ChartTranslationOption::EvaluateWithSourceContext(const InputType &input,
+ const InputPath &inputPath,
+ const StackVec &stackVec)
{
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
for (size_t i = 0; i < ffs.size(); ++i) {
const FeatureFunction &ff = *ffs[i];
- ff.Evaluate(input, inputPath, m_targetPhrase, &stackVec, m_scoreBreakdown);
+ ff.EvaluateWithSourceContext(input, inputPath, m_targetPhrase, &stackVec, m_scoreBreakdown);
}
}
diff --git a/moses/ChartTranslationOption.h b/moses/ChartTranslationOption.h
index bea18e77f..06a6f797e 100644
--- a/moses/ChartTranslationOption.h
+++ b/moses/ChartTranslationOption.h
@@ -44,9 +44,9 @@ public:
return m_scoreBreakdown;
}
- void Evaluate(const InputType &input,
- const InputPath &inputPath,
- const StackVec &stackVec);
+ void EvaluateWithSourceContext(const InputType &input,
+ const InputPath &inputPath,
+ const StackVec &stackVec);
};
}
diff --git a/moses/ChartTranslationOptionList.cpp b/moses/ChartTranslationOptionList.cpp
index 504b03c6c..8d3d9b3ab 100644
--- a/moses/ChartTranslationOptionList.cpp
+++ b/moses/ChartTranslationOptionList.cpp
@@ -168,13 +168,13 @@ float ChartTranslationOptionList::GetBestScore(const ChartCellLabel *chartCell)
return bestHypo.GetTotalScore();
}
-void ChartTranslationOptionList::Evaluate(const InputType &input, const InputPath &inputPath)
+void ChartTranslationOptionList::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
{
// NEVER iterate over ALL of the collection. Just over the first m_size
CollType::iterator iter;
for (iter = m_collection.begin(); iter != m_collection.begin() + m_size; ++iter) {
ChartTranslationOptions &transOpts = **iter;
- transOpts.Evaluate(input, inputPath);
+ transOpts.EvaluateWithSourceContext(input, inputPath);
}
// get rid of empty trans opts
diff --git a/moses/ChartTranslationOptionList.h b/moses/ChartTranslationOptionList.h
index dcf4f1c4e..4723bdd1d 100644
--- a/moses/ChartTranslationOptionList.h
+++ b/moses/ChartTranslationOptionList.h
@@ -65,7 +65,7 @@ public:
void Clear();
void ApplyThreshold();
- void Evaluate(const InputType &input, const InputPath &inputPath);
+ void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
private:
typedef std::vector<ChartTranslationOptions*> CollType;
diff --git a/moses/ChartTranslationOptions.cpp b/moses/ChartTranslationOptions.cpp
index d4d5c5b72..44aa67619 100644
--- a/moses/ChartTranslationOptions.cpp
+++ b/moses/ChartTranslationOptions.cpp
@@ -51,7 +51,7 @@ ChartTranslationOptions::~ChartTranslationOptions()
}
-void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &inputPath)
+void ChartTranslationOptions::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
{
SetInputPath(&inputPath);
if (StaticData::Instance().GetPlaceholderFactor() != NOT_FOUND) {
@@ -62,7 +62,7 @@ void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
ChartTranslationOption &transOpt = **iter;
transOpt.SetInputPath(&inputPath);
- transOpt.Evaluate(input, inputPath, m_stackVec);
+ transOpt.EvaluateWithSourceContext(input, inputPath, m_stackVec);
}
// get rid of -inf trans opts
@@ -71,9 +71,10 @@ void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &
ChartTranslationOption *transOpt = m_collection[i].get();
if (transOpt->GetScores().GetWeightedScore() == - std::numeric_limits<float>::infinity()) {
- ++numDiscard;
- } else if (numDiscard) {
- m_collection[i - numDiscard] = boost::shared_ptr<ChartTranslationOption>(transOpt);
+ ++numDiscard;
+ }
+ else if (numDiscard) {
+ m_collection[i - numDiscard] = m_collection[i];
}
}
@@ -134,12 +135,12 @@ void ChartTranslationOptions::CreateSourceRuleFromInputPath()
std::ostream& operator<<(std::ostream &out, const ChartTranslationOptions &obj)
{
- for (size_t i = 0; i < obj.m_collection.size(); ++i) {
- const ChartTranslationOption &transOpt = *obj.m_collection[i];
- out << transOpt << endl;
- }
+ for (size_t i = 0; i < obj.m_collection.size(); ++i) {
+ const ChartTranslationOption &transOpt = *obj.m_collection[i];
+ out << transOpt << endl;
+ }
- return out;
+ return out;
}
}
diff --git a/moses/ChartTranslationOptions.h b/moses/ChartTranslationOptions.h
index dc610477b..73c378eb0 100644
--- a/moses/ChartTranslationOptions.h
+++ b/moses/ChartTranslationOptions.h
@@ -86,7 +86,7 @@ public:
return m_estimateOfBestScore;
}
- void Evaluate(const InputType &input, const InputPath &inputPath);
+ void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
void SetInputPath(const InputPath *inputPath);
diff --git a/moses/ConfusionNet.cpp b/moses/ConfusionNet.cpp
index d18d78ad6..d9270bd1b 100644
--- a/moses/ConfusionNet.cpp
+++ b/moses/ConfusionNet.cpp
@@ -14,299 +14,299 @@
namespace Moses
{
-struct CNStats {
- size_t created,destr,read,colls,words;
+ struct CNStats {
+ size_t created,destr,read,colls,words;
+
+ CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
+ ~CNStats() {
+ print(std::cerr);
+ }
- CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
- ~CNStats() {
- print(std::cerr);
+ void createOne() {
+ ++created;
+ }
+ void destroyOne() {
+ ++destr;
+ }
+
+ void collect(const ConfusionNet& cn) {
+ ++read;
+ colls+=cn.GetSize();
+ for(size_t i=0; i<cn.GetSize(); ++i)
+ words+=cn[i].size();
+ }
+ void print(std::ostream& out) const {
+ if(created>0) {
+ out<<"confusion net statistics:\n"
+ " created:\t"<<created<<"\n"
+ " destroyed:\t"<<destr<<"\n"
+ " succ. read:\t"<<read<<"\n"
+ " columns:\t"<<colls<<"\n"
+ " words:\t"<<words<<"\n"
+ " avg. word/column:\t"<<words/(1.0*colls)<<"\n"
+ " avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
+ "\n\n";
+ }
+ }
+ };
+
+ CNStats stats;
+
+ size_t
+ ConfusionNet::
+ GetColumnIncrement(size_t i, size_t j) const
+ {
+ (void) i;
+ (void) j;
+ return 1;
}
- void createOne() {
- ++created;
- }
- void destroyOne() {
- ++destr;
- }
+ ConfusionNet::
+ ConfusionNet()
+ : InputType()
+ {
+ stats.createOne();
- void collect(const ConfusionNet& cn) {
- ++read;
- colls+=cn.GetSize();
- for(size_t i=0; i<cn.GetSize(); ++i)
- words+=cn[i].size();
- }
- void print(std::ostream& out) const {
- if(created>0) {
- out<<"confusion net statistics:\n"
- " created:\t"<<created<<"\n"
- " destroyed:\t"<<destr<<"\n"
- " succ. read:\t"<<read<<"\n"
- " columns:\t"<<colls<<"\n"
- " words:\t"<<words<<"\n"
- " avg. word/column:\t"<<words/(1.0*colls)<<"\n"
- " avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
- "\n\n";
+ const StaticData& staticData = StaticData::Instance();
+ if (staticData.IsChart()) {
+ m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
}
+ UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
}
-};
-
-CNStats stats;
-
-size_t
-ConfusionNet::
-GetColumnIncrement(size_t i, size_t j) const
-{
- (void) i;
- (void) j;
- return 1;
-}
-
-ConfusionNet::
-ConfusionNet()
- : InputType()
-{
- stats.createOne();
- const StaticData& staticData = StaticData::Instance();
- if (staticData.IsChart()) {
- m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
+ ConfusionNet::
+ ~ConfusionNet()
+ {
+ stats.destroyOne();
}
- UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
-}
-ConfusionNet::
-~ConfusionNet()
-{
- stats.destroyOne();
-}
-
-ConfusionNet::
-ConfusionNet(Sentence const& s)
-{
- data.resize(s.GetSize());
- for(size_t i=0; i<s.GetSize(); ++i) {
- ScorePair scorePair;
- std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
- data[i].push_back(temp);
+ ConfusionNet::
+ ConfusionNet(Sentence const& s)
+ {
+ data.resize(s.GetSize());
+ for(size_t i=0; i<s.GetSize(); ++i) {
+ ScorePair scorePair;
+ std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
+ data[i].push_back(temp);
+ }
}
-}
-bool
-ConfusionNet::
-ReadF(std::istream& in, const std::vector<FactorType>& factorOrder, int format)
-{
- VERBOSE(2, "read confusion net with format "<<format<<"\n");
- switch(format) {
- case 0:
- return ReadFormat0(in,factorOrder);
- case 1:
- return ReadFormat1(in,factorOrder);
- default:
- std::stringstream strme;
- strme << "ERROR: unknown format '"<<format
- <<"' in ConfusionNet::Read";
- UserMessage::Add(strme.str());
+ bool
+ ConfusionNet::
+ ReadF(std::istream& in, const std::vector<FactorType>& factorOrder, int format)
+ {
+ VERBOSE(2, "read confusion net with format "<<format<<"\n");
+ switch(format) {
+ case 0:
+ return ReadFormat0(in,factorOrder);
+ case 1:
+ return ReadFormat1(in,factorOrder);
+ default:
+ std::stringstream strme;
+ strme << "ERROR: unknown format '"<<format
+ <<"' in ConfusionNet::Read";
+ UserMessage::Add(strme.str());
+ }
+ return false;
}
- return false;
-}
-int
-ConfusionNet::
-Read(std::istream& in,
- const std::vector<FactorType>& factorOrder)
-{
- int rv=ReadF(in,factorOrder,0);
- if(rv) stats.collect(*this);
- return rv;
-}
+ int
+ ConfusionNet::
+ Read(std::istream& in,
+ const std::vector<FactorType>& factorOrder)
+ {
+ int rv=ReadF(in,factorOrder,0);
+ if(rv) stats.collect(*this);
+ return rv;
+ }
#if 0
-// Deprecated due to code duplication;
-// use Word::CreateFromString() instead
-void
-ConfusionNet::
-String2Word(const std::string& s,Word& w,
- const std::vector<FactorType>& factorOrder)
-{
- std::vector<std::string> factorStrVector = Tokenize(s, "|");
- for(size_t i=0; i<factorOrder.size(); ++i)
- w.SetFactor(factorOrder[i],
- FactorCollection::Instance().AddFactor
- (Input,factorOrder[i], factorStrVector[i]));
-}
+ // Deprecated due to code duplication;
+ // use Word::CreateFromString() instead
+ void
+ ConfusionNet::
+ String2Word(const std::string& s,Word& w,
+ const std::vector<FactorType>& factorOrder)
+ {
+ std::vector<std::string> factorStrVector = Tokenize(s, "|");
+ for(size_t i=0; i<factorOrder.size(); ++i)
+ w.SetFactor(factorOrder[i],
+ FactorCollection::Instance().AddFactor
+ (Input,factorOrder[i], factorStrVector[i]));
+ }
#endif
-bool
-ConfusionNet::
-ReadFormat0(std::istream& in, const std::vector<FactorType>& factorOrder)
-{
- Clear();
-
- const StaticData &staticData = StaticData::Instance();
- const InputFeature &inputFeature = InputFeature::Instance();
- size_t numInputScores = inputFeature.GetNumInputScores();
- size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
-
- size_t totalCount = numInputScores + numRealWordCount;
- bool addRealWordCount = (numRealWordCount > 0);
-
- std::string line;
- while(getline(in,line)) {
- std::istringstream is(line);
- std::string word;
-
- Column col;
- while(is>>word) {
- Word w;
- // String2Word(word,w,factorOrder);
- w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
- std::vector<float> probs(totalCount, 0.0);
- for(size_t i=0; i < numInputScores; i++) {
- double prob;
- if (!(is>>prob)) {
- TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
- return false;
- }
- if(prob<0.0) {
- VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
- prob=0.0;
- } else if (prob>1.0) {
- VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
- prob=1.0;
- }
- probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
-
+ bool
+ ConfusionNet::
+ ReadFormat0(std::istream& in, const std::vector<FactorType>& factorOrder)
+ {
+ Clear();
+
+ // const StaticData &staticData = StaticData::Instance();
+ const InputFeature &inputFeature = InputFeature::Instance();
+ size_t numInputScores = inputFeature.GetNumInputScores();
+ size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
+
+ size_t totalCount = numInputScores + numRealWordCount;
+ bool addRealWordCount = (numRealWordCount > 0);
+
+ std::string line;
+ while(getline(in,line)) {
+ std::istringstream is(line);
+ std::string word;
+
+ Column col;
+ while(is>>word) {
+ Word w;
+ // String2Word(word,w,factorOrder);
+ w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
+ std::vector<float> probs(totalCount, 0.0);
+ for(size_t i=0; i < numInputScores; i++) {
+ double prob;
+ if (!(is>>prob)) {
+ TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
+ return false;
+ }
+ if(prob<0.0) {
+ VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
+ prob=0.0;
+ } else if (prob>1.0) {
+ VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
+ prob=1.0;
+ }
+ probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
+
+ }
+ //store 'real' word count in last feature if we have one more weight than we do arc scores and not epsilon
+ if (addRealWordCount && word!=EPSILON && word!="")
+ probs.back() = -1.0;
+
+ ScorePair scorePair(probs);
+
+ col.push_back(std::make_pair(w,scorePair));
}
- //store 'real' word count in last feature if we have one more weight than we do arc scores and not epsilon
- if (addRealWordCount && word!=EPSILON && word!="")
- probs.back() = -1.0;
-
- ScorePair scorePair(probs);
-
- col.push_back(std::make_pair(w,scorePair));
+ if(col.size()) {
+ data.push_back(col);
+ ShrinkToFit(data.back());
+ } else break;
}
- if(col.size()) {
- data.push_back(col);
- ShrinkToFit(data.back());
- } else break;
+ return !data.empty();
}
- return !data.empty();
-}
-bool
-ConfusionNet::
-ReadFormat1(std::istream& in, const std::vector<FactorType>& factorOrder)
-{
- Clear();
- std::string line;
- if(!getline(in,line)) return 0;
- size_t s;
- if(getline(in,line)) s=atoi(line.c_str());
- else return 0;
- data.resize(s);
- for(size_t i=0; i<data.size(); ++i) {
+ bool
+ ConfusionNet::
+ ReadFormat1(std::istream& in, const std::vector<FactorType>& factorOrder)
+ {
+ Clear();
+ std::string line;
if(!getline(in,line)) return 0;
- std::istringstream is(line);
- if(!(is>>s)) return 0;
- std::string word;
- double prob;
- data[i].resize(s);
- for(size_t j=0; j<s; ++j)
- if(is>>word>>prob) {
- //TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
- data[i][j].second.denseScores = std::vector<float> (1);
- data[i][j].second.denseScores.push_back((float) log(prob));
- if(data[i][j].second.denseScores[0]<0) {
- VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
- data[i][j].second.denseScores[0]=0.0;
- }
- // String2Word(word,data[i][j].first,factorOrder);
- Word& w = data[i][j].first;
- w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
- } else return 0;
+ size_t s;
+ if(getline(in,line)) s=atoi(line.c_str());
+ else return 0;
+ data.resize(s);
+ for(size_t i=0; i<data.size(); ++i) {
+ if(!getline(in,line)) return 0;
+ std::istringstream is(line);
+ if(!(is>>s)) return 0;
+ std::string word;
+ double prob;
+ data[i].resize(s);
+ for(size_t j=0; j<s; ++j)
+ if(is>>word>>prob) {
+ //TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
+ data[i][j].second.denseScores = std::vector<float> (1);
+ data[i][j].second.denseScores.push_back((float) log(prob));
+ if(data[i][j].second.denseScores[0]<0) {
+ VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
+ data[i][j].second.denseScores[0]=0.0;
+ }
+ // String2Word(word,data[i][j].first,factorOrder);
+ Word& w = data[i][j].first;
+ w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
+ } else return 0;
+ }
+ return !data.empty();
}
- return !data.empty();
-}
-
-void ConfusionNet::Print(std::ostream& out) const
-{
- out<<"conf net: "<<data.size()<<"\n";
- for(size_t i=0; i<data.size(); ++i) {
- out<<i<<" -- ";
- for(size_t j=0; j<data[i].size(); ++j) {
- out<<"("<<data[i][j].first.ToString()<<", ";
-
- // dense
- std::vector<float>::const_iterator iterDense;
- for(iterDense = data[i][j].second.denseScores.begin();
- iterDense < data[i][j].second.denseScores.end();
- ++iterDense) {
- out<<", "<<*iterDense;
- }
- // sparse
- std::map<StringPiece, float>::const_iterator iterSparse;
- for(iterSparse = data[i][j].second.sparseScores.begin();
- iterSparse != data[i][j].second.sparseScores.end();
- ++iterSparse) {
- out << ", " << iterSparse->first << "=" << iterSparse->second;
+ void ConfusionNet::Print(std::ostream& out) const
+ {
+ out<<"conf net: "<<data.size()<<"\n";
+ for(size_t i=0; i<data.size(); ++i) {
+ out<<i<<" -- ";
+ for(size_t j=0; j<data[i].size(); ++j) {
+ out<<"("<<data[i][j].first.ToString()<<", ";
+
+ // dense
+ std::vector<float>::const_iterator iterDense;
+ for(iterDense = data[i][j].second.denseScores.begin();
+ iterDense < data[i][j].second.denseScores.end();
+ ++iterDense) {
+ out<<", "<<*iterDense;
+ }
+
+ // sparse
+ std::map<StringPiece, float>::const_iterator iterSparse;
+ for(iterSparse = data[i][j].second.sparseScores.begin();
+ iterSparse != data[i][j].second.sparseScores.end();
+ ++iterSparse) {
+ out << ", " << iterSparse->first << "=" << iterSparse->second;
+ }
+
+ out<<") ";
}
-
- out<<") ";
+ out<<"\n";
}
- out<<"\n";
+ out<<"\n\n";
}
- out<<"\n\n";
-}
#ifdef _WIN32
#pragma warning(disable:4716)
#endif
-Phrase
-ConfusionNet::
-GetSubString(const WordsRange&) const
-{
- UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
- //return Phrase(Input);
-}
+ Phrase
+ ConfusionNet::
+ GetSubString(const WordsRange&) const
+ {
+ UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
+ //return Phrase(Input);
+ }
-std::string
-ConfusionNet::
-GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
-{
- TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
- return "";
-}
+ std::string
+ ConfusionNet::
+ GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
+ {
+ TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
+ return "";
+ }
#ifdef _WIN32
#pragma warning(disable:4716)
#endif
-const Word& ConfusionNet::GetWord(size_t) const
-{
- UTIL_THROW2("ERROR: call to ConfusionNet::GetFactorArray\n");
-}
+ const Word& ConfusionNet::GetWord(size_t) const
+ {
+ UTIL_THROW2("ERROR: call to ConfusionNet::GetFactorArray\n");
+ }
#ifdef _WIN32
#pragma warning(default:4716)
#endif
-std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
-{
- cn.Print(out);
- return out;
-}
+ std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
+ {
+ cn.Print(out);
+ return out;
+ }
-TranslationOptionCollection*
-ConfusionNet::
-CreateTranslationOptionCollection() const
-{
- size_t maxNoTransOptPerCoverage
- = StaticData::Instance().GetMaxNoTransOptPerCoverage();
- float translationOptionThreshold
- = StaticData::Instance().GetTranslationOptionThreshold();
- TranslationOptionCollection *rv
- = new TranslationOptionCollectionConfusionNet
- (*this, maxNoTransOptPerCoverage, translationOptionThreshold);
- assert(rv);
- return rv;
-}
+ TranslationOptionCollection*
+ ConfusionNet::
+ CreateTranslationOptionCollection() const
+ {
+ size_t maxNoTransOptPerCoverage
+ = StaticData::Instance().GetMaxNoTransOptPerCoverage();
+ float translationOptionThreshold
+ = StaticData::Instance().GetTranslationOptionThreshold();
+ TranslationOptionCollection *rv
+ = new TranslationOptionCollectionConfusionNet
+ (*this, maxNoTransOptPerCoverage, translationOptionThreshold);
+ assert(rv);
+ return rv;
+ }
}
diff --git a/moses/DecodeGraph.h b/moses/DecodeGraph.h
index dd3100f45..ebb7ef9e1 100644
--- a/moses/DecodeGraph.h
+++ b/moses/DecodeGraph.h
@@ -38,7 +38,7 @@ class DecodeGraph
{
protected:
std::list<const DecodeStep*> m_steps;
- size_t m_position;
+ size_t m_id; // contiguous unique id, starting from 0
size_t m_maxChartSpan;
size_t m_backoff;
@@ -46,15 +46,15 @@ public:
/**
* position: The position of this graph within the decode sequence.
**/
- DecodeGraph(size_t position)
- : m_position(position)
+ DecodeGraph(size_t id)
+ : m_id(id)
, m_maxChartSpan(NOT_FOUND)
, m_backoff(0)
{}
// for chart decoding
- DecodeGraph(size_t position, size_t maxChartSpan)
- : m_position(position)
+ DecodeGraph(size_t id, size_t maxChartSpan)
+ : m_id(id)
, m_maxChartSpan(maxChartSpan) {
}
@@ -90,8 +90,8 @@ public:
m_backoff = backoff;
}
- size_t GetPosition() const {
- return m_position;
+ size_t GetId() const {
+ return m_id;
}
};
diff --git a/moses/DecodeStepGeneration.cpp b/moses/DecodeStepGeneration.cpp
index bcc151bb4..26b95cdb5 100644
--- a/moses/DecodeStepGeneration.cpp
+++ b/moses/DecodeStepGeneration.cpp
@@ -148,7 +148,7 @@ void DecodeStepGeneration::Process(const TranslationOption &inputPartialTranslOp
outPhrase.GetScoreBreakdown().PlusEquals(generationScore);
outPhrase.MergeFactors(genPhrase, m_newOutputFactors);
- outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply);
+ outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply);
const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
diff --git a/moses/DecodeStepTranslation.cpp b/moses/DecodeStepTranslation.cpp
index d8375eb81..e7dbba4f3 100644
--- a/moses/DecodeStepTranslation.cpp
+++ b/moses/DecodeStepTranslation.cpp
@@ -84,8 +84,7 @@ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslO
}
outPhrase.Merge(targetPhrase, m_newOutputFactors);
- outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
- cerr << "DecodeStepTranslation::Process is calling outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply)" << endl;
+ outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
assert(newTransOpt != NULL);
@@ -199,11 +198,11 @@ const InputPath &DecodeStepTranslation::GetInputPathLEGACY(
const Word *wordIP = NULL;
for (size_t i = 0; i < phraseFromIP.GetSize(); ++i) {
- const Word &tempWord = phraseFromIP.GetWord(i);
- if (!tempWord.IsEpsilon()) {
- wordIP = &tempWord;
- break;
- }
+ const Word &tempWord = phraseFromIP.GetWord(i);
+ if (!tempWord.IsEpsilon()) {
+ wordIP = &tempWord;
+ break;
+ }
}
// const WordsRange &range = inputPath.GetWordsRange();
@@ -238,7 +237,7 @@ void DecodeStepTranslation::ProcessLEGACY(const TranslationOption &inputPartialT
const size_t tableLimit = phraseDictionary->GetTableLimit();
const TargetPhraseCollectionWithSourcePhrase *phraseColl
- = phraseDictionary->GetTargetPhraseCollectionLEGACY(toc->GetSource(),sourceWordsRange);
+ = phraseDictionary->GetTargetPhraseCollectionLEGACY(toc->GetSource(),sourceWordsRange);
if (phraseColl != NULL) {
@@ -259,7 +258,7 @@ void DecodeStepTranslation::ProcessLEGACY(const TranslationOption &inputPartialT
}
outPhrase.Merge(targetPhrase, m_newOutputFactors);
- outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
+ outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
diff --git a/moses/FF/BleuScoreFeature.cpp b/moses/FF/BleuScoreFeature.cpp
index 348eaa0ea..0d0a20797 100644
--- a/moses/FF/BleuScoreFeature.cpp
+++ b/moses/FF/BleuScoreFeature.cpp
@@ -502,7 +502,7 @@ void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase,
* Given a previous state, compute Bleu score for the updated state with an additional target
* phrase translated.
*/
-FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
+FFState* BleuScoreFeature::EvaluateWhenApplied(const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
@@ -563,7 +563,7 @@ FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
return new_state;
}
-FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureID,
+FFState* BleuScoreFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID,
ScoreComponentCollection* accumulator ) const
{
if (!m_enabled) return new BleuScoreState();
diff --git a/moses/FF/BleuScoreFeature.h b/moses/FF/BleuScoreFeature.h
index 1664e48dd..adbb70a98 100644
--- a/moses/FF/BleuScoreFeature.h
+++ b/moses/FF/BleuScoreFeature.h
@@ -115,20 +115,20 @@ public:
std::vector< size_t >&,
size_t skip = 0) const;
- FFState* Evaluate( const Hypothesis& cur_hypo,
+ FFState* EvaluateWhenApplied( const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
+ FFState* EvaluateWhenApplied(const ChartHypothesis& cur_hypo,
int featureID,
ScoreComponentCollection* accumulator) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/ConstrainedDecoding.cpp b/moses/FF/ConstrainedDecoding.cpp
index ddbe1871b..845a23519 100644
--- a/moses/FF/ConstrainedDecoding.cpp
+++ b/moses/FF/ConstrainedDecoding.cpp
@@ -100,7 +100,7 @@ const std::vector<Phrase> *GetConstraint(const std::map<long,std::vector<Phrase>
}
}
-FFState* ConstrainedDecoding::Evaluate(
+FFState* ConstrainedDecoding::EvaluateWhenApplied(
const Hypothesis& hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
@@ -141,7 +141,7 @@ FFState* ConstrainedDecoding::Evaluate(
return ret;
}
-FFState* ConstrainedDecoding::EvaluateChart(
+FFState* ConstrainedDecoding::EvaluateWhenApplied(
const ChartHypothesis &hypo,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const
diff --git a/moses/FF/ConstrainedDecoding.h b/moses/FF/ConstrainedDecoding.h
index 1fd206943..ca007f21d 100644
--- a/moses/FF/ConstrainedDecoding.h
+++ b/moses/FF/ConstrainedDecoding.h
@@ -41,26 +41,26 @@ public:
return true;
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
-
- void Evaluate(const InputType &input
+
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
-
- FFState* Evaluate(
+
+ FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(
+ FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
diff --git a/moses/FF/ControlRecombination.cpp b/moses/FF/ControlRecombination.cpp
index e09d6a7fa..f7231d9b0 100644
--- a/moses/FF/ControlRecombination.cpp
+++ b/moses/FF/ControlRecombination.cpp
@@ -56,7 +56,7 @@ std::vector<float> ControlRecombination::DefaultWeights() const
return ret;
}
-FFState* ControlRecombination::Evaluate(
+FFState* ControlRecombination::EvaluateWhenApplied(
const Hypothesis& hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
@@ -64,7 +64,7 @@ FFState* ControlRecombination::Evaluate(
return new ControlRecombinationState(hypo, *this);
}
-FFState* ControlRecombination::EvaluateChart(
+FFState* ControlRecombination::EvaluateWhenApplied(
const ChartHypothesis &hypo,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const
diff --git a/moses/FF/ControlRecombination.h b/moses/FF/ControlRecombination.h
index 0100d500d..095cc6b29 100644
--- a/moses/FF/ControlRecombination.h
+++ b/moses/FF/ControlRecombination.h
@@ -57,24 +57,24 @@ public:
return true;
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- FFState* Evaluate(
+ FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(
+ FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
diff --git a/moses/FF/CountNonTerms.cpp b/moses/FF/CountNonTerms.cpp
index 5f876e9e4..03c7b7315 100644
--- a/moses/FF/CountNonTerms.cpp
+++ b/moses/FF/CountNonTerms.cpp
@@ -8,18 +8,18 @@ using namespace std;
namespace Moses
{
CountNonTerms::CountNonTerms(const std::string &line)
- :StatelessFeatureFunction(line)
- ,m_all(true)
- ,m_sourceSyntax(false)
- ,m_targetSyntax(false)
+:StatelessFeatureFunction(line)
+,m_all(true)
+,m_sourceSyntax(false)
+,m_targetSyntax(false)
{
ReadParameters();
}
-void CountNonTerms::Evaluate(const Phrase &sourcePhrase
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+void CountNonTerms::EvaluateInIsolation(const Phrase &sourcePhrase
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
const StaticData &staticData = StaticData::Instance();
@@ -27,33 +27,33 @@ void CountNonTerms::Evaluate(const Phrase &sourcePhrase
size_t indScore = 0;
if (m_all) {
- for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
- const Word &word = targetPhrase.GetWord(i);
- if (word.IsNonTerminal()) {
- ++scores[indScore];
- }
- }
- ++indScore;
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
+ const Word &word = targetPhrase.GetWord(i);
+ if (word.IsNonTerminal()) {
+ ++scores[indScore];
+ }
+ }
+ ++indScore;
}
if (m_targetSyntax) {
- for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
- const Word &word = targetPhrase.GetWord(i);
- if (word.IsNonTerminal() && word != staticData.GetOutputDefaultNonTerminal()) {
- ++scores[indScore];
- }
- }
- ++indScore;
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
+ const Word &word = targetPhrase.GetWord(i);
+ if (word.IsNonTerminal() && word != staticData.GetOutputDefaultNonTerminal()) {
+ ++scores[indScore];
+ }
+ }
+ ++indScore;
}
if (m_sourceSyntax) {
- for (size_t i = 0; i < sourcePhrase.GetSize(); ++i) {
- const Word &word = sourcePhrase.GetWord(i);
- if (word.IsNonTerminal() && word != staticData.GetInputDefaultNonTerminal()) {
- ++scores[indScore];
- }
- }
- ++indScore;
+ for (size_t i = 0; i < sourcePhrase.GetSize(); ++i) {
+ const Word &word = sourcePhrase.GetWord(i);
+ if (word.IsNonTerminal() && word != staticData.GetInputDefaultNonTerminal()) {
+ ++scores[indScore];
+ }
+ }
+ ++indScore;
}
scoreBreakdown.PlusEquals(this, scores);
@@ -64,9 +64,9 @@ void CountNonTerms::SetParameter(const std::string& key, const std::string& valu
if (key == "all") {
m_all = Scan<bool>(value);
} else if (key == "source-syntax") {
- m_sourceSyntax = Scan<bool>(value);
+ m_sourceSyntax = Scan<bool>(value);
} else if (key == "target-syntax") {
- m_targetSyntax = Scan<bool>(value);
+ m_targetSyntax = Scan<bool>(value);
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
diff --git a/moses/FF/CountNonTerms.h b/moses/FF/CountNonTerms.h
index 7e5865c3c..2e0a528a4 100644
--- a/moses/FF/CountNonTerms.h
+++ b/moses/FF/CountNonTerms.h
@@ -13,12 +13,12 @@ public:
return true;
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
@@ -26,11 +26,11 @@ public:
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(
+ void EvaluateWhenApplied(
const ChartHypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
diff --git a/moses/FF/CoveredReferenceFeature.cpp b/moses/FF/CoveredReferenceFeature.cpp
index 57217e062..3a2482d0d 100644
--- a/moses/FF/CoveredReferenceFeature.cpp
+++ b/moses/FF/CoveredReferenceFeature.cpp
@@ -22,44 +22,44 @@ int CoveredReferenceState::Compare(const FFState& other) const
const CoveredReferenceState &otherState = static_cast<const CoveredReferenceState&>(other);
if (m_coveredRef.size() != otherState.m_coveredRef.size()) {
- return (m_coveredRef.size() < otherState.m_coveredRef.size()) ? -1 : +1;
+ return (m_coveredRef.size() < otherState.m_coveredRef.size()) ? -1 : +1;
} else {
multiset<string>::const_iterator thisIt, otherIt;
for (thisIt = m_coveredRef.begin(), otherIt = otherState.m_coveredRef.begin();
- thisIt != m_coveredRef.end();
- thisIt++, otherIt++) {
+ thisIt != m_coveredRef.end();
+ thisIt++, otherIt++) {
if (*thisIt != *otherIt) return thisIt->compare(*otherIt);
}
}
return 0;
// return m_coveredRef == otherState.m_coveredRef;
-
+
// if (m_coveredRef == otherState.m_coveredRef)
// return 0;
// return (m_coveredRef.size() < otherState.m_coveredRef.size()) ? -1 : +1;
}
-void CoveredReferenceFeature::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+void CoveredReferenceFeature::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{}
-void CoveredReferenceFeature::Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore) const
+void CoveredReferenceFeature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
{
long id = input.GetTranslationId();
boost::unordered_map<long, std::multiset<string> >::const_iterator refIt = m_refs.find(id);
multiset<string> wordsInPhrase = GetWordsInPhrase(targetPhrase);
multiset<string> covered;
set_intersection(wordsInPhrase.begin(), wordsInPhrase.end(),
- refIt->second.begin(), refIt->second.end(),
- inserter(covered, covered.begin()));
+ refIt->second.begin(), refIt->second.end(),
+ inserter(covered, covered.begin()));
vector<float> scores;
scores.push_back(covered.size());
@@ -67,8 +67,7 @@ void CoveredReferenceFeature::Evaluate(const InputType &input
estimatedFutureScore->Assign(this, scores);
}
-void CoveredReferenceFeature::Load()
-{
+void CoveredReferenceFeature::Load() {
InputFileStream refFile(m_path);
std::string line;
const StaticData &staticData = StaticData::Instance();
@@ -77,7 +76,7 @@ void CoveredReferenceFeature::Load()
vector<string> words = Tokenize(line, " ");
multiset<string> wordSet;
// TODO make Tokenize work with other containers than vector
- copy(words.begin(), words.end(), inserter(wordSet, wordSet.begin()));
+ copy(words.begin(), words.end(), inserter(wordSet, wordSet.begin()));
m_refs.insert(make_pair(sentenceID++, wordSet));
}
}
@@ -91,7 +90,7 @@ void CoveredReferenceFeature::SetParameter(const std::string& key, const std::st
}
}
-FFState* CoveredReferenceFeature::Evaluate(
+FFState* CoveredReferenceFeature::EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
@@ -108,15 +107,15 @@ FFState* CoveredReferenceFeature::Evaluate(
boost::unordered_map<long, std::multiset<string> >::const_iterator refIt = m_refs.find(id);
if (refIt == m_refs.end()) UTIL_THROW(util::Exception, "Sentence id out of range: " + SPrint<long>(id));
set_difference(refIt->second.begin(), refIt->second.end(),
- ret->m_coveredRef.begin(), ret->m_coveredRef.end(),
- inserter(remaining, remaining.begin()));
+ ret->m_coveredRef.begin(), ret->m_coveredRef.end(),
+ inserter(remaining, remaining.begin()));
// which of the remaining words are present in the current phrase
multiset<string> wordsInPhrase = GetWordsInPhrase(cur_hypo.GetCurrTargetPhrase());
multiset<string> newCovered;
set_intersection(wordsInPhrase.begin(), wordsInPhrase.end(),
- remaining.begin(), remaining.end(),
- inserter(newCovered, newCovered.begin()));
+ remaining.begin(), remaining.end(),
+ inserter(newCovered, newCovered.begin()));
vector<float> estimateScore =
cur_hypo.GetCurrTargetPhrase().GetScoreBreakdown().GetScoresForProducer(this);
@@ -132,7 +131,7 @@ FFState* CoveredReferenceFeature::Evaluate(
return ret;
}
-FFState* CoveredReferenceFeature::EvaluateChart(
+FFState* CoveredReferenceFeature::EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const
diff --git a/moses/FF/CoveredReferenceFeature.h b/moses/FF/CoveredReferenceFeature.h
index 6f789d640..652cbf41e 100644
--- a/moses/FF/CoveredReferenceFeature.h
+++ b/moses/FF/CoveredReferenceFeature.h
@@ -51,21 +51,21 @@ public:
return new CoveredReferenceState();
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
- FFState* Evaluate(
+ FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(
+ FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
diff --git a/moses/FF/DecodeFeature.cpp b/moses/FF/DecodeFeature.cpp
index c6f331314..11c8653ea 100644
--- a/moses/FF/DecodeFeature.cpp
+++ b/moses/FF/DecodeFeature.cpp
@@ -50,8 +50,8 @@ DecodeFeature::DecodeFeature(size_t numScoreComponents
, const std::vector<FactorType> &output
, const std::string &line)
: StatelessFeatureFunction(numScoreComponents, line)
- , m_container(NULL)
, m_input(input), m_output(output)
+ , m_container(NULL)
{
m_inputFactors = FactorMask(input);
m_outputFactors = FactorMask(output);
diff --git a/moses/FF/DecodeFeature.h b/moses/FF/DecodeFeature.h
index bdda7edf8..cac55a150 100644
--- a/moses/FF/DecodeFeature.h
+++ b/moses/FF/DecodeFeature.h
@@ -62,20 +62,23 @@ public:
bool IsUseable(const FactorMask &mask) const;
void SetParameter(const std::string& key, const std::string& value);
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
- void Evaluate(const InputType &input
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
+ ScoreComponentCollection* accumulator) const
+ {}
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/DistortionScoreProducer.cpp b/moses/FF/DistortionScoreProducer.cpp
index 303f35236..5995fe213 100644
--- a/moses/FF/DistortionScoreProducer.cpp
+++ b/moses/FF/DistortionScoreProducer.cpp
@@ -87,7 +87,7 @@ float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo,
}
-FFState* DistortionScoreProducer::Evaluate(
+FFState* DistortionScoreProducer::EvaluateWhenApplied(
const Hypothesis& hypo,
const FFState* prev_state,
ScoreComponentCollection* out) const
diff --git a/moses/FF/DistortionScoreProducer.h b/moses/FF/DistortionScoreProducer.h
index 1bc6493e2..aa2c18b95 100644
--- a/moses/FF/DistortionScoreProducer.h
+++ b/moses/FF/DistortionScoreProducer.h
@@ -28,26 +28,26 @@ public:
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- virtual FFState* Evaluate(
+ virtual FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- virtual FFState* EvaluateChart(
+ virtual FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection*) const {
throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet");
}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/DynamicCacheBasedLanguageModel.cpp b/moses/FF/DynamicCacheBasedLanguageModel.cpp
index e794e84d2..f4e98b3c5 100644
--- a/moses/FF/DynamicCacheBasedLanguageModel.cpp
+++ b/moses/FF/DynamicCacheBasedLanguageModel.cpp
@@ -85,7 +85,7 @@ void DynamicCacheBasedLanguageModel::SetParameter(const std::string& key, const
}
}
-void DynamicCacheBasedLanguageModel::Evaluate(const Phrase &sp
+void DynamicCacheBasedLanguageModel::EvaluateInIsolation(const Phrase &sp
, const TargetPhrase &tp
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/DynamicCacheBasedLanguageModel.h b/moses/FF/DynamicCacheBasedLanguageModel.h
index 6af0a8ac7..422b18511 100644
--- a/moses/FF/DynamicCacheBasedLanguageModel.h
+++ b/moses/FF/DynamicCacheBasedLanguageModel.h
@@ -74,11 +74,9 @@ class DynamicCacheBasedLanguageModel : public StatelessFeatureFunction
void Insert(std::vector<std::string> ngrams);
- void Evaluate( const TargetPhrase&, ScoreComponentCollection* ) const;
-
+// void EvaluateInIsolation(const Phrase&, const TargetPhrase&, ScoreComponentCollection&, ScoreComponentCollection& ) const;
void Print() const;
-
protected:
static DynamicCacheBasedLanguageModel *s_instance;
static std::map< const std::string, DynamicCacheBasedLanguageModel * > s_instance_map;
@@ -124,43 +122,30 @@ public:
void Insert(std::string &entries);
void Clear();
- virtual void Evaluate(const Phrase &source
+ virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- virtual void Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const
- {}
-
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
- void Evaluate(const InputType &input
- , const InputPath &inputPath
- , ScoreComponentCollection &scoreBreakdown) const
- {}
-
void SetQueryType(size_t type);
void SetScoreType(size_t type);
void SetMaxAge(unsigned int age);
-
};
}
diff --git a/moses/FF/ExternalFeature.cpp b/moses/FF/ExternalFeature.cpp
index 141541170..10800d24d 100644
--- a/moses/FF/ExternalFeature.cpp
+++ b/moses/FF/ExternalFeature.cpp
@@ -51,7 +51,7 @@ void ExternalFeature::SetParameter(const std::string& key, const std::string& va
}
}
-FFState* ExternalFeature::Evaluate(
+FFState* ExternalFeature::EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
@@ -59,7 +59,7 @@ FFState* ExternalFeature::Evaluate(
return new ExternalFeatureState(m_stateSize);
}
-FFState* ExternalFeature::EvaluateChart(
+FFState* ExternalFeature::EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const
diff --git a/moses/FF/ExternalFeature.h b/moses/FF/ExternalFeature.h
index 19eb45f2a..888fef951 100644
--- a/moses/FF/ExternalFeature.h
+++ b/moses/FF/ExternalFeature.h
@@ -51,24 +51,24 @@ public:
void SetParameter(const std::string& key, const std::string& value);
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- FFState* Evaluate(
+ FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(
+ FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
@@ -87,6 +87,7 @@ protected:
class CdecFF
{
public:
+ virtual ~CdecFF() {}
virtual int StateSize() const = 0;
};
diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp
index 9d1637b46..73f1ada0f 100644
--- a/moses/FF/Factory.cpp
+++ b/moses/FF/Factory.cpp
@@ -2,16 +2,18 @@
#include "moses/StaticData.h"
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
-#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModel.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
-#include "moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryScope3.h"
#include "moses/TranslationModel/PhraseDictionaryTransliteration.h"
#include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h"
+#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
+#include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
+#include "moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h"
+
#include "moses/FF/LexicalReordering/LexicalReordering.h"
#include "moses/FF/BleuScoreFeature.h"
@@ -27,6 +29,7 @@
#include "moses/FF/PhrasePairFeature.h"
#include "moses/FF/PhraseLengthFeature.h"
#include "moses/FF/DistortionScoreProducer.h"
+#include "moses/FF/SparseHieroReorderingFeature.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "moses/FF/InputFeature.h"
#include "moses/FF/PhrasePenalty.h"
@@ -34,22 +37,30 @@
#include "moses/FF/ControlRecombination.h"
#include "moses/FF/ExternalFeature.h"
#include "moses/FF/ConstrainedDecoding.h"
+#include "moses/FF/SoftSourceSyntacticConstraintsFeature.h"
#include "moses/FF/CoveredReferenceFeature.h"
#include "moses/FF/TreeStructureFeature.h"
#include "moses/FF/SoftMatchingFeature.h"
#include "moses/FF/DynamicCacheBasedLanguageModel.h"
+#include "moses/FF/SourceGHKMTreeInputMatchFeature.h"
#include "moses/FF/HyperParameterAsWeight.h"
#include "moses/FF/SetSourcePhrase.h"
+#include "moses/FF/PhraseOrientationFeature.h"
#include "CountNonTerms.h"
#include "ReferenceComparison.h"
#include "RuleScope.h"
#include "MaxSpanFreeNonTermSource.h"
#include "NieceTerminal.h"
+#include "SpanLength.h"
+#include "SyntaxRHS.h"
#include "moses/FF/SkeletonStatelessFF.h"
#include "moses/FF/SkeletonStatefulFF.h"
#include "moses/LM/SkeletonLM.h"
+#include "moses/LM/BilingualLM.h"
+#include "SkeletonChangeInput.h"
#include "moses/TranslationModel/SkeletonPT.h"
+#include "moses/Syntax/RuleTableFF.h"
#ifdef HAVE_CMPH
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
@@ -57,6 +68,9 @@
#ifdef PT_UG
#include "moses/TranslationModel/UG/mmsapt.h"
#endif
+#ifdef HAVE_PROBINGPT
+#include "moses/TranslationModel/ProbingPT/ProbingPT.h"
+#endif
#include "moses/LM/Ken.h"
#ifdef LM_IRST
@@ -81,12 +95,18 @@
#ifdef LM_NEURAL
#include "moses/LM/NeuralLMWrapper.h"
+#include "moses/LM/bilingual-lm/BiLM_NPLM.h"
#endif
#ifdef LM_DALM
#include "moses/LM/DALMWrapper.h"
#endif
+#ifdef LM_OXLM
+#include "moses/LM/oxlm/OxLM.h"
+#include "moses/LM/oxlm/SourceOxLM.h"
+#endif
+
#include "util/exception.hh"
#include <vector>
@@ -150,6 +170,20 @@ FeatureRegistry::FeatureRegistry()
#define MOSES_FNAME(name) Add(#name, new DefaultFeatureFactory< name >());
// Feature with different name than class.
#define MOSES_FNAME2(name, type) Add(name, new DefaultFeatureFactory< type >());
+
+ MOSES_FNAME2("PhraseDictionaryBinary", PhraseDictionaryTreeAdaptor);
+ MOSES_FNAME(PhraseDictionaryOnDisk);
+ MOSES_FNAME(PhraseDictionaryMemory);
+ MOSES_FNAME(PhraseDictionaryScope3);
+ MOSES_FNAME(PhraseDictionaryMultiModel);
+ MOSES_FNAME(PhraseDictionaryMultiModelCounts);
+ MOSES_FNAME(PhraseDictionaryALSuffixArray);
+ MOSES_FNAME(PhraseDictionaryDynSuffixArray);
+ MOSES_FNAME(PhraseDictionaryTransliteration);
+ MOSES_FNAME(PhraseDictionaryDynamicCacheBased);
+ MOSES_FNAME(PhraseDictionaryFuzzyMatch);
+ MOSES_FNAME2("RuleTable", Syntax::RuleTableFF);
+
MOSES_FNAME(GlobalLexicalModel);
//MOSES_FNAME(GlobalLexicalModelUnlimited); This was commented out in the original
MOSES_FNAME(SourceWordDeletionFeature);
@@ -166,16 +200,6 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME2("Distortion", DistortionScoreProducer);
MOSES_FNAME2("WordPenalty", WordPenaltyProducer);
MOSES_FNAME(InputFeature);
- MOSES_FNAME2("PhraseDictionaryBinary", PhraseDictionaryTreeAdaptor);
- MOSES_FNAME(PhraseDictionaryOnDisk);
- MOSES_FNAME(PhraseDictionaryMemory);
- MOSES_FNAME(PhraseDictionaryScope3);
- MOSES_FNAME(PhraseDictionaryMultiModel);
- MOSES_FNAME(PhraseDictionaryMultiModelCounts);
- MOSES_FNAME(PhraseDictionaryALSuffixArray);
- MOSES_FNAME(PhraseDictionaryDynSuffixArray);
- MOSES_FNAME(PhraseDictionaryDynamicCacheBased);
- MOSES_FNAME(PhraseDictionaryTransliteration);
MOSES_FNAME(OpSequenceModel);
MOSES_FNAME(PhrasePenalty);
MOSES_FNAME2("UnknownWordPenalty", UnknownWordPenaltyProducer);
@@ -183,6 +207,8 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(ConstrainedDecoding);
MOSES_FNAME(CoveredReferenceFeature);
MOSES_FNAME(ExternalFeature);
+ MOSES_FNAME(SourceGHKMTreeInputMatchFeature);
+ MOSES_FNAME(SoftSourceSyntacticConstraintsFeature);
MOSES_FNAME(TreeStructureFeature);
MOSES_FNAME(SoftMatchingFeature);
MOSES_FNAME(DynamicCacheBasedLanguageModel);
@@ -193,10 +219,15 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(RuleScope);
MOSES_FNAME(MaxSpanFreeNonTermSource);
MOSES_FNAME(NieceTerminal);
+ MOSES_FNAME(SparseHieroReorderingFeature);
+ MOSES_FNAME(SpanLength);
+ MOSES_FNAME(SyntaxRHS);
+ MOSES_FNAME(PhraseOrientationFeature);
MOSES_FNAME(SkeletonStatelessFF);
MOSES_FNAME(SkeletonStatefulFF);
MOSES_FNAME(SkeletonLM);
+ MOSES_FNAME(SkeletonChangeInput);
MOSES_FNAME(SkeletonPT);
#ifdef HAVE_CMPH
@@ -204,7 +235,12 @@ FeatureRegistry::FeatureRegistry()
#endif
#ifdef PT_UG
MOSES_FNAME(Mmsapt);
+ MOSES_FNAME2("PhraseDictionaryBitextSampling",Mmsapt); // that's an alias for Mmsapt!
#endif
+#ifdef HAVE_PROBINGPT
+ MOSES_FNAME(ProbingPT);
+#endif
+
#ifdef HAVE_SYNLM
MOSES_FNAME(SyntacticLanguageModel);
#endif
@@ -222,10 +258,18 @@ FeatureRegistry::FeatureRegistry()
#endif
#ifdef LM_NEURAL
MOSES_FNAME2("NeuralLM", NeuralLMWrapper);
+ MOSES_FNAME2("BilingualNPLM", BilingualLM_NPLM);
#endif
#ifdef LM_DALM
MOSES_FNAME2("DALM", LanguageModelDALM);
#endif
+#ifdef LM_OXLM
+ MOSES_FNAME2("OxLM", OxLM<oxlm::LM>);
+ MOSES_FNAME2("OxFactoredLM", OxLM<oxlm::FactoredLM>);
+ MOSES_FNAME2("OxFactoredMaxentLM", OxLM<oxlm::FactoredMaxentLM>);
+ MOSES_FNAME2("OxSourceFactoredLM", SourceOxLM);
+ MOSES_FNAME2("OxTreeLM", OxLM<oxlm::FactoredTreeLM>);
+#endif
Add("KENLM", new KenFactory());
}
@@ -254,13 +298,22 @@ void FeatureRegistry::Construct(const std::string &name, const std::string &line
void FeatureRegistry::PrintFF() const
{
- std::cerr << "Available feature functions:" << std::endl;
- Map::const_iterator iter;
- for (iter = registry_.begin(); iter != registry_.end(); ++iter) {
- const string &ffName = iter->first;
- std::cerr << ffName << " ";
- }
- std::cerr << std::endl;
+ vector<string> ffs;
+ std::cerr << "Available feature functions:" << std::endl;
+ Map::const_iterator iter;
+ for (iter = registry_.begin(); iter != registry_.end(); ++iter) {
+ const string &ffName = iter->first;
+ ffs.push_back(ffName);
+ }
+
+ vector<string>::const_iterator iterVec;
+ std::sort(ffs.begin(), ffs.end());
+ for (iterVec = ffs.begin(); iterVec != ffs.end(); ++iterVec) {
+ const string &ffName = *iterVec;
+ std::cerr << ffName << " ";
+ }
+
+ std::cerr << std::endl;
}
} // namespace Moses
diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp
index 90d198680..22eacd271 100644
--- a/moses/FF/FeatureFunction.cpp
+++ b/moses/FF/FeatureFunction.cpp
@@ -34,9 +34,18 @@ void FeatureFunction::Destroy()
RemoveAllInColl(s_staticColl);
}
+void FeatureFunction::CallChangeSource(InputType *&input)
+{
+ for (size_t i = 0; i < s_staticColl.size(); ++i) {
+ const FeatureFunction &ff = *s_staticColl[i];
+ ff.ChangeSource(input);
+ }
+}
+
FeatureFunction::
FeatureFunction(const std::string& line)
: m_tuneable(true)
+ , m_verbosity(std::numeric_limits<std::size_t>::max())
, m_numScoreComponents(1)
{
Initialize(line);
@@ -46,6 +55,7 @@ FeatureFunction::
FeatureFunction(size_t numScoreComponents,
const std::string& line)
: m_tuneable(true)
+ , m_verbosity(std::numeric_limits<std::size_t>::max())
, m_numScoreComponents(numScoreComponents)
{
Initialize(line);
@@ -107,6 +117,8 @@ void FeatureFunction::SetParameter(const std::string& key, const std::string& va
{
if (key == "tuneable") {
m_tuneable = Scan<bool>(value);
+ } else if (key == "verbosity") {
+ m_verbosity = Scan<size_t>(value);
} else if (key == "filterable") { //ignore
} else {
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
diff --git a/moses/FF/FeatureFunction.h b/moses/FF/FeatureFunction.h
index 18b016c8f..115797228 100644
--- a/moses/FF/FeatureFunction.h
+++ b/moses/FF/FeatureFunction.h
@@ -4,6 +4,7 @@
#include <vector>
#include <set>
#include <string>
+#include "moses/FeatureVector.h"
#include "moses/TypeDef.h"
namespace Moses
@@ -33,6 +34,7 @@ protected:
std::string m_description, m_argLine;
std::vector<std::vector<std::string> > m_args;
bool m_tuneable;
+ size_t m_verbosity;
size_t m_numScoreComponents;
//In case there's multiple producers with the same description
static std::multiset<std::string> description_counts;
@@ -47,6 +49,8 @@ public:
static FeatureFunction &FindFeatureFunction(const std::string& name);
static void Destroy();
+ static void CallChangeSource(InputType *&input);
+
FeatureFunction(const std::string &line);
FeatureFunction(size_t numScoreComponents, const std::string &line);
virtual bool IsStateless() const = 0;
@@ -71,6 +75,11 @@ public:
return m_description;
}
+ FName GetFeatureName(const std::string& name) const {
+ return FName(GetScoreProducerDescription(), name);
+ }
+
+
//! if false, then this feature is not displayed in the n-best list.
// use with care
virtual bool IsTuneable() const {
@@ -98,11 +107,15 @@ public:
// source phrase is the substring that the phrase table uses to look up the target phrase,
// may have more factors than actually need, but not guaranteed.
// For SCFG decoding, the source contains non-terminals, NOT the raw source from the input sentence
- virtual void Evaluate(const Phrase &source
+ virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const = 0;
+ // override this method if you want to change the input before decoding
+ virtual void ChangeSource(InputType *&input) const
+ {}
+
// This method is called once all the translation options are retrieved from the phrase table, and
// just before search.
// 'inputPath' is guaranteed to be the raw substring from the input. No factors were added or taken away
@@ -110,7 +123,7 @@ public:
// It is guaranteed to be in the same order as the non-terms in the source phrase.
// For pb models, stackvec is NULL.
// No FF should set estimatedFutureScore in both overloads!
- virtual void Evaluate(const InputType &input
+ virtual void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp
index ff9e87bb0..5c603bc51 100644
--- a/moses/FF/GlobalLexicalModel.cpp
+++ b/moses/FF/GlobalLexicalModel.cpp
@@ -165,12 +165,12 @@ float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetP
return score;
}
-void GlobalLexicalModel::Evaluate
-(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+void GlobalLexicalModel::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
- accumulator->PlusEquals( this,
- GetFromCacheOrScorePhrase(hypo.GetCurrTargetPhrase()) );
+ scoreBreakdown.PlusEquals( this, GetFromCacheOrScorePhrase(targetPhrase) );
}
bool GlobalLexicalModel::IsUseable(const FactorMask &mask) const
diff --git a/moses/FF/GlobalLexicalModel.h b/moses/FF/GlobalLexicalModel.h
index 664835df5..65b5cf2b8 100644
--- a/moses/FF/GlobalLexicalModel.h
+++ b/moses/FF/GlobalLexicalModel.h
@@ -70,29 +70,25 @@ public:
bool IsUseable(const FactorMask &mask) const;
- void Evaluate(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const;
-
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
- void EvaluateChart(
- const ChartHypothesis& hypo,
- ScoreComponentCollection* accumulator) const {
- throw std::logic_error("GlobalLexicalModel not supported in chart decoder, yet");
- }
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
- {}
-
};
diff --git a/moses/FF/GlobalLexicalModelUnlimited.cpp b/moses/FF/GlobalLexicalModelUnlimited.cpp
index 56bd011c1..3f1812df9 100644
--- a/moses/FF/GlobalLexicalModelUnlimited.cpp
+++ b/moses/FF/GlobalLexicalModelUnlimited.cpp
@@ -108,7 +108,7 @@ void GlobalLexicalModelUnlimited::InitializeForInput( Sentence const& in )
m_local->input = &in;
}
-void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComponentCollection* accumulator) const
+void GlobalLexicalModelUnlimited::EvaluateWhenApplied(const Hypothesis& cur_hypo, ScoreComponentCollection* accumulator) const
{
const Sentence& input = *(m_local->input);
const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
diff --git a/moses/FF/GlobalLexicalModelUnlimited.h b/moses/FF/GlobalLexicalModelUnlimited.h
index f12df7d61..096254613 100644
--- a/moses/FF/GlobalLexicalModelUnlimited.h
+++ b/moses/FF/GlobalLexicalModelUnlimited.h
@@ -81,23 +81,23 @@ public:
//TODO: This implements the old interface, but cannot be updated because
//it appears to be stateful
- void Evaluate(const Hypothesis& cur_hypo,
+ void EvaluateWhenApplied(const Hypothesis& cur_hypo,
ScoreComponentCollection* accumulator) const;
- void EvaluateChart(const ChartHypothesis& /* cur_hypo */,
+ void EvaluateWhenApplied(const ChartHypothesis& /* cur_hypo */,
int /* featureID */,
ScoreComponentCollection* ) const {
throw std::logic_error("GlobalLexicalModelUnlimited not supported in chart decoder, yet");
}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/HyperParameterAsWeight.h b/moses/FF/HyperParameterAsWeight.h
index e8979639e..5838bf2fd 100644
--- a/moses/FF/HyperParameterAsWeight.h
+++ b/moses/FF/HyperParameterAsWeight.h
@@ -18,13 +18,13 @@ public:
return true;
}
- virtual void Evaluate(const Phrase &source
+ virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
- virtual void Evaluate(const InputType &input
+ virtual void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
@@ -32,14 +32,14 @@ public:
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- virtual void Evaluate(const Hypothesis& hypo,
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
/**
* Same for chart-based features.
**/
- virtual void EvaluateChart(const ChartHypothesis &hypo,
+ virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
diff --git a/moses/FF/InputFeature.cpp b/moses/FF/InputFeature.cpp
index 2352847e8..3ce6a9190 100644
--- a/moses/FF/InputFeature.cpp
+++ b/moses/FF/InputFeature.cpp
@@ -44,7 +44,7 @@ void InputFeature::SetParameter(const std::string& key, const std::string& value
}
-void InputFeature::Evaluate(const InputType &input
+void InputFeature::EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
diff --git a/moses/FF/InputFeature.h b/moses/FF/InputFeature.h
index e4b1a8d99..ad4fe398a 100644
--- a/moses/FF/InputFeature.h
+++ b/moses/FF/InputFeature.h
@@ -41,22 +41,23 @@ public:
return m_numRealWordCount;
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
- void Evaluate(const InputType &input
+
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
diff --git a/moses/FF/InternalStructStatelessFF.cpp b/moses/FF/InternalStructStatelessFF.cpp
deleted file mode 100644
index 3245fab94..000000000
--- a/moses/FF/InternalStructStatelessFF.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-#include "InternalStructStatelessFF.h"
-#include "moses/InputPath.h"
-#include "moses/ScoreComponentCollection.h"
-using namespace std;
-
-namespace Moses
-{
-void InternalStructStatelessFF::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
-{
-// cerr << "MARIA!!!" << endl;
- scoreBreakdown.PlusEquals(this, 0);
-
-}
-
-void InternalStructStatelessFF::Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore) const
-{
-
- scoreBreakdown.PlusEquals(this, 66);
- /* FactorList f_mask;
- f_mask.push_back(0);
- //if(inputPath.GetPhrase().GetStringRep(f_mask).)
- int score =50;
- for(size_t i=0;i<inputPath.GetPhrase().GetSize();i++){
- if(inputPath.GetPhrase(). GetFactor(i,0)->GetString().as_string()=="ist"){
- //cout<<inputPath.GetPhrase().GetStringRep(f_mask);
- score+=1;
- }
- }
- scoreBreakdown.PlusEquals(this, score);
- */
-}
-
-}
-
diff --git a/moses/FF/InternalTree.cpp b/moses/FF/InternalTree.cpp
new file mode 100644
index 000000000..2537cc50f
--- /dev/null
+++ b/moses/FF/InternalTree.cpp
@@ -0,0 +1,230 @@
+#include "InternalTree.h"
+
+namespace Moses
+{
+
+InternalTree::InternalTree(const std::string & line, const bool terminal):
+ m_value_nt(0),
+ m_isTerminal(terminal)
+ {
+
+ size_t found = line.find_first_of("[] ");
+
+ if (found == line.npos) {
+ m_value = line;
+ }
+
+ else {
+ AddSubTree(line, 0);
+ }
+}
+
+size_t InternalTree::AddSubTree(const std::string & line, size_t pos) {
+
+ std::string value;
+ char token = 0;
+
+ while (token != ']' && pos != std::string::npos)
+ {
+ size_t oldpos = pos;
+ pos = line.find_first_of("[] ", pos);
+ if (pos == std::string::npos) break;
+ token = line[pos];
+ value = line.substr(oldpos,pos-oldpos);
+
+ if (token == '[') {
+ if (m_value.size() > 0) {
+ m_children.push_back(boost::make_shared<InternalTree>(value,false));
+ pos = m_children.back()->AddSubTree(line, pos+1);
+ }
+ else {
+ if (value.size() > 0) {
+ m_value = value;
+ }
+ pos = AddSubTree(line, pos+1);
+ }
+ }
+ else if (token == ' ' || token == ']') {
+ if (value.size() > 0 && !(m_value.size() > 0)) {
+ m_value = value;
+ }
+ else if (value.size() > 0) {
+ m_isTerminal = false;
+ m_children.push_back(boost::make_shared<InternalTree>(value,true));
+ }
+ if (token == ' ') {
+ pos++;
+ }
+ }
+
+ if (m_children.size() > 0) {
+ m_isTerminal = false;
+ }
+ }
+
+ if (pos == std::string::npos) {
+ return line.size();
+ }
+ return std::min(line.size(),pos+1);
+
+}
+
+std::string InternalTree::GetString(bool start) const {
+
+ std::string ret = "";
+ if (!start) {
+ ret += " ";
+ }
+
+ if (!m_isTerminal) {
+ ret += "[";
+ }
+
+ ret += m_value;
+ for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it)
+ {
+ ret += (*it)->GetString(false);
+ }
+
+ if (!m_isTerminal) {
+ ret += "]";
+ }
+ return ret;
+
+}
+
+
+void InternalTree::Combine(const std::vector<TreePointer> &previous) {
+
+ std::vector<TreePointer>::iterator it;
+ bool found = false;
+ leafNT next_leafNT(this);
+ for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
+ found = next_leafNT(it);
+ if (found) {
+ *it = *it_prev;
+ }
+ else {
+ std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
+ }
+ }
+}
+
+
+bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if ((*it)->GetLabel() == label) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if ((*it)->GetLabel() == label) {
+ return true;
+ }
+ std::vector<TreePointer>::const_iterator it2;
+ if ((*it)->RecursiveSearch(label, it2)) {
+ it = it2;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if ((*it)->GetLabel() == label) {
+ parent = this;
+ return true;
+ }
+ std::vector<TreePointer>::const_iterator it2;
+ if ((*it)->RecursiveSearch(label, it2, parent)) {
+ it = it2;
+ return true;
+ }
+ }
+ return false;
+}
+
+
+bool InternalTree::FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if ((*it)->GetNTLabel() == label) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if ((*it)->GetNTLabel() == label) {
+ return true;
+ }
+ std::vector<TreePointer>::const_iterator it2;
+ if ((*it)->RecursiveSearch(label, it2)) {
+ it = it2;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if ((*it)->GetNTLabel() == label) {
+ parent = this;
+ return true;
+ }
+ std::vector<TreePointer>::const_iterator it2;
+ if ((*it)->RecursiveSearch(label, it2, parent)) {
+ it = it2;
+ return true;
+ }
+ }
+ return false;
+}
+
+
+bool InternalTree::FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
+ return true;
+ }
+ std::vector<TreePointer>::const_iterator it2;
+ if ((*it)->RecursiveSearch(labels, it2)) {
+ it = it2;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
+ for (it = m_children.begin(); it != m_children.end(); ++it) {
+ if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
+ parent = this;
+ return true;
+ }
+ std::vector<TreePointer>::const_iterator it2;
+ if ((*it)->RecursiveSearch(labels, it2, parent)) {
+ it = it2;
+ return true;
+ }
+ }
+ return false;
+}
+
+}
diff --git a/moses/FF/InternalTree.h b/moses/FF/InternalTree.h
new file mode 100644
index 000000000..19006cdd3
--- /dev/null
+++ b/moses/FF/InternalTree.h
@@ -0,0 +1,177 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <map>
+#include <vector>
+#include "FFState.h"
+#include <boost/shared_ptr.hpp>
+#include <boost/make_shared.hpp>
+#include "util/generator.hh"
+#include "util/exception.hh"
+
+namespace Moses
+{
+
+class InternalTree;
+typedef boost::shared_ptr<InternalTree> TreePointer;
+typedef int NTLabel;
+
+class InternalTree
+{
+std::string m_value;
+NTLabel m_value_nt;
+std::vector<TreePointer> m_children;
+bool m_isTerminal;
+public:
+ InternalTree(const std::string & line, const bool terminal = false);
+ InternalTree(const InternalTree & tree):
+ m_value(tree.m_value),
+ m_isTerminal(tree.m_isTerminal) {
+ const std::vector<TreePointer> & children = tree.m_children;
+ for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
+ m_children.push_back(boost::make_shared<InternalTree>(**it));
+ }
+ }
+ size_t AddSubTree(const std::string & line, size_t start);
+
+ std::string GetString(bool start = true) const;
+ void Combine(const std::vector<TreePointer> &previous);
+ const std::string & GetLabel() const {
+ return m_value;
+ }
+
+ // optionally identify label by int instead of string;
+ // allows abstraction if multiple nonterminal strings should map to same label.
+ const NTLabel & GetNTLabel() const {
+ return m_value_nt;
+ }
+
+ void SetNTLabel(NTLabel value) {
+ m_value_nt = value;
+ }
+
+ size_t GetLength() const {
+ return m_children.size();
+ }
+ std::vector<TreePointer> & GetChildren() {
+ return m_children;
+ }
+
+ bool IsTerminal() const {
+ return m_isTerminal;
+ }
+
+ bool IsLeafNT() const {
+ return (!m_isTerminal && m_children.size() == 0);
+ }
+
+ // different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents.
+ // can be used for formulating syntax constraints.
+
+ // if found, 'it' is iterator to first tree node that matches search string
+ bool FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
+ bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
+
+ // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
+ bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
+
+ // use NTLabel for search to reduce number of string comparisons / deal with synonymous labels
+ // if found, 'it' is iterator to first tree node that matches search string
+ bool FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
+ bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
+
+ // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
+ bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
+
+ // pass vector of possible labels to search
+ // if found, 'it' is iterator to first tree node that matches search string
+ bool FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
+ bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
+
+ // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
+ bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
+
+
+};
+
+class TreeState : public FFState
+{
+ TreePointer m_tree;
+public:
+ TreeState(TreePointer tree)
+ :m_tree(tree)
+ {}
+
+ TreePointer GetTree() const {
+ return m_tree;
+ }
+
+ int Compare(const FFState& other) const {return 0;};
+};
+
+// Python-like generator that yields next nonterminal leaf on every call
+$generator(leafNT) {
+ std::vector<TreePointer>::iterator it;
+ InternalTree* tree;
+ leafNT(InternalTree* root = 0): tree(root) {}
+ $emit(std::vector<TreePointer>::iterator)
+ for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
+ if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+ $yield(it);
+ }
+ else if ((*it)->GetLength() > 0) {
+ if ((*it).get()) { // normal pointer to same object that TreePointer points to
+ $restart(tree = (*it).get());
+ }
+ }
+ }
+ $stop;
+};
+
+
+// Python-like generator that yields the parent of the next nonterminal leaf on every call
+$generator(leafNTParent) {
+ std::vector<TreePointer>::iterator it;
+ InternalTree* tree;
+ leafNTParent(InternalTree* root = 0): tree(root) {}
+ $emit(InternalTree*)
+ for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
+ if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+ $yield(tree);
+ }
+ else if ((*it)->GetLength() > 0) {
+ if ((*it).get()) {
+ $restart(tree = (*it).get());
+ }
+ }
+ }
+ $stop;
+};
+
+// Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
+$generator(leafNTPath) {
+ std::vector<TreePointer>::iterator it;
+ InternalTree* tree;
+ std::vector<InternalTree*> * path;
+ leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
+ $emit(std::vector<TreePointer>::iterator)
+ path->push_back(tree);
+ for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
+ if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+ path->push_back((*it).get());
+ $yield(it);
+ path->pop_back();
+ }
+ else if ((*it)->GetLength() > 0) {
+ if ((*it).get()) {
+ $restart(tree = (*it).get());
+ }
+ }
+ }
+ path->pop_back();
+ $stop;
+};
+
+
+} \ No newline at end of file
diff --git a/moses/FF/LexicalReordering/LexicalReordering.cpp b/moses/FF/LexicalReordering/LexicalReordering.cpp
index 967c22546..2f870f957 100644
--- a/moses/FF/LexicalReordering/LexicalReordering.cpp
+++ b/moses/FF/LexicalReordering/LexicalReordering.cpp
@@ -14,11 +14,13 @@ LexicalReordering::LexicalReordering(const std::string &line)
{
std::cerr << "Initializing LexicalReordering.." << std::endl;
+ map<string,string> sparseArgs;
+ m_haveDefaultScores = false;
for (size_t i = 0; i < m_args.size(); ++i) {
const vector<string> &args = m_args[i];
if (args[0] == "type") {
- m_configuration = new LexicalReorderingConfiguration(args[1]);
+ m_configuration.reset(new LexicalReorderingConfiguration(args[1]));
m_configuration->SetScoreProducer(this);
m_modelTypeString = m_configuration->GetModelString();
} else if (args[0] == "input-factor") {
@@ -27,8 +29,16 @@ LexicalReordering::LexicalReordering(const std::string &line)
m_factorsE =Tokenize<FactorType>(args[1]);
} else if (args[0] == "path") {
m_filePath = args[1];
+ } else if (args[0].substr(0,7) == "sparse-") {
+ sparseArgs[args[0].substr(7)] = args[1];
+ } else if (args[0] == "default-scores") {
+ vector<string> tokens = Tokenize(args[1],",");
+ for(size_t i=0; i<tokens.size(); i++) {
+ m_defaultScores.push_back( TransformScore( Scan<float>(tokens[i]) ) );
+ }
+ m_haveDefaultScores = true;
} else {
- throw "Unknown argument " + args[0];
+ UTIL_THROW(util::Exception,"Unknown argument " + args[0]);
}
}
@@ -36,29 +46,36 @@ LexicalReordering::LexicalReordering(const std::string &line)
case LexicalReorderingConfiguration::FE:
case LexicalReorderingConfiguration::E:
if(m_factorsE.empty()) {
- throw "TL factor mask for lexical reordering is unexpectedly empty";
+ UTIL_THROW(util::Exception,"TL factor mask for lexical reordering is unexpectedly empty");
}
if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
break; // else fall through
case LexicalReorderingConfiguration::F:
if(m_factorsF.empty()) {
- throw "SL factor mask for lexical reordering is unexpectedly empty";
+ UTIL_THROW(util::Exception,"SL factor mask for lexical reordering is unexpectedly empty");
}
break;
default:
- throw "Unknown conditioning option!";
+ UTIL_THROW(util::Exception,"Unknown conditioning option!");
}
+
+ // sanity check: number of default scores
+ if (m_haveDefaultScores) {
+ if(m_defaultScores.size() != m_configuration->GetNumScoreComponents()) {
+ UTIL_THROW(util::Exception,"wrong number of default scores (" << m_defaultScores.size() << ") for lexicalized reordering model (expected " << m_configuration->GetNumScoreComponents() << ")");
+ }
+ }
+
+ m_configuration->ConfigureSparse(sparseArgs, this);
}
LexicalReordering::~LexicalReordering()
{
- delete m_table;
- delete m_configuration;
}
void LexicalReordering::Load()
{
- m_table = LexicalReorderingTable::LoadAvailable(m_filePath, m_factorsF, m_factorsE, std::vector<FactorType>());
+ m_table.reset(LexicalReorderingTable::LoadAvailable(m_filePath, m_factorsF, m_factorsE, std::vector<FactorType>()));
}
Scores LexicalReordering::GetProb(const Phrase& f, const Phrase& e) const
@@ -66,14 +83,14 @@ Scores LexicalReordering::GetProb(const Phrase& f, const Phrase& e) const
return m_table->GetScore(f, e, Phrase(ARRAY_SIZE_INCR));
}
-FFState* LexicalReordering::Evaluate(const Hypothesis& hypo,
+FFState* LexicalReordering::EvaluateWhenApplied(const Hypothesis& hypo,
const FFState* prev_state,
ScoreComponentCollection* out) const
{
VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) START" << std::endl);
Scores score(GetNumScoreComponents(), 0);
const LexicalReorderingState *prev = dynamic_cast<const LexicalReorderingState *>(prev_state);
- LexicalReorderingState *next_state = prev->Expand(hypo.GetTranslationOption(), score);
+ LexicalReorderingState *next_state = prev->Expand(hypo.GetTranslationOption(), hypo.GetInput(), out);
out->PlusEquals(this, score);
VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) END" << std::endl);
diff --git a/moses/FF/LexicalReordering/LexicalReordering.h b/moses/FF/LexicalReordering/LexicalReordering.h
index 4ff0057f0..2e73f8736 100644
--- a/moses/FF/LexicalReordering/LexicalReordering.h
+++ b/moses/FF/LexicalReordering/LexicalReordering.h
@@ -3,17 +3,20 @@
#include <string>
#include <vector>
+#include <boost/scoped_ptr.hpp>
#include "moses/Factor.h"
#include "moses/Phrase.h"
#include "moses/TypeDef.h"
#include "moses/Util.h"
#include "moses/WordsRange.h"
-#include "LexicalReorderingState.h"
-#include "LexicalReorderingTable.h"
#include "moses/FF/StatefulFeatureFunction.h"
#include "util/exception.hh"
+#include "LexicalReorderingState.h"
+#include "LexicalReorderingTable.h"
+#include "SparseReordering.h"
+
namespace Moses
{
@@ -42,43 +45,47 @@ public:
Scores GetProb(const Phrase& f, const Phrase& e) const;
- virtual FFState* Evaluate(const Hypothesis& cur_hypo,
+ virtual FFState* EvaluateWhenApplied(const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- virtual FFState* EvaluateChart(const ChartHypothesis&,
+ virtual FFState* EvaluateWhenApplied(const ChartHypothesis&,
int /* featureID */,
ScoreComponentCollection*) const {
UTIL_THROW(util::Exception, "LexicalReordering is not valid for chart decoder");
}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
+ bool GetHaveDefaultScores() { return m_haveDefaultScores; }
+ float GetDefaultScore( size_t i ) { return m_defaultScores[i]; }
private:
bool DecodeCondition(std::string s);
bool DecodeDirection(std::string s);
bool DecodeNumFeatureFunctions(std::string s);
- LexicalReorderingConfiguration *m_configuration;
+ boost::scoped_ptr<LexicalReorderingConfiguration> m_configuration;
std::string m_modelTypeString;
std::vector<std::string> m_modelType;
- LexicalReorderingTable* m_table;
+ boost::scoped_ptr<LexicalReorderingTable> m_table;
//std::vector<Direction> m_direction;
std::vector<LexicalReorderingConfiguration::Condition> m_condition;
//std::vector<size_t> m_scoreOffset;
//bool m_oneScorePerDirection;
std::vector<FactorType> m_factorsE, m_factorsF;
std::string m_filePath;
+ bool m_haveDefaultScores;
+ Scores m_defaultScores;
};
}
diff --git a/moses/FF/LexicalReordering/LexicalReorderingState.cpp b/moses/FF/LexicalReordering/LexicalReorderingState.cpp
index e3d3da453..847409496 100644
--- a/moses/FF/LexicalReordering/LexicalReorderingState.cpp
+++ b/moses/FF/LexicalReordering/LexicalReorderingState.cpp
@@ -5,11 +5,11 @@
#include "moses/FF/FFState.h"
#include "moses/Hypothesis.h"
#include "moses/WordsRange.h"
-#include "moses/ReorderingStack.h"
#include "moses/TranslationOption.h"
#include "LexicalReordering.h"
#include "LexicalReorderingState.h"
+#include "ReorderingStack.h"
namespace Moses
{
@@ -38,6 +38,14 @@ size_t LexicalReorderingConfiguration::GetNumScoreComponents() const
}
}
+void LexicalReorderingConfiguration::ConfigureSparse
+ (const std::map<std::string,std::string>& sparseArgs, const LexicalReordering* producer)
+{
+ if (sparseArgs.size()) {
+ m_sparse.reset(new SparseReordering(sparseArgs, producer));
+ }
+}
+
void LexicalReorderingConfiguration::SetAdditionalScoreComponents(size_t number)
{
m_additionalScoreComponents = number;
@@ -122,52 +130,66 @@ LexicalReorderingState *LexicalReorderingConfiguration::CreateLexicalReorderingS
return new BidirectionalReorderingState(*this, bwd, fwd, 0);
}
-void LexicalReorderingState::CopyScores(Scores& scores, const TranslationOption &topt, ReorderingType reoType) const
+void LexicalReorderingState::CopyScores(ScoreComponentCollection* accum, const TranslationOption &topt, const InputType& input, ReorderingType reoType) const
{
// don't call this on a bidirectional object
UTIL_THROW_IF2(m_direction != LexicalReorderingConfiguration::Backward && m_direction != LexicalReorderingConfiguration::Forward,
- "Unknown direction: " << m_direction);
- const Scores *cachedScores = (m_direction == LexicalReorderingConfiguration::Backward) ?
- topt.GetLexReorderingScores(m_configuration.GetScoreProducer()) : m_prevScore;
-
- // No scores available. TODO: Using a good prior distribution would be nicer.
- if(cachedScores == NULL)
- return;
-
- const Scores &scoreSet = *cachedScores;
- if(m_configuration.CollapseScores())
- scores[m_offset] = scoreSet[m_offset + reoType];
- else {
- std::fill(scores.begin() + m_offset, scores.begin() + m_offset + m_configuration.GetNumberOfTypes(), 0);
- scores[m_offset + reoType] = scoreSet[m_offset + reoType];
+ "Unknown direction: " << m_direction);
+ const TranslationOption* relevantOpt = &topt;
+ if (m_direction != LexicalReorderingConfiguration::Backward) relevantOpt = m_prevOption;
+ const Scores *cachedScores = relevantOpt->GetLexReorderingScores(m_configuration.GetScoreProducer());
+
+ // look up applicable score from vectore of scores
+ if(cachedScores) {
+ Scores scores(m_configuration.GetScoreProducer()->GetNumScoreComponents(),0);
+
+ const Scores &scoreSet = *cachedScores;
+ if(m_configuration.CollapseScores()) {
+ scores[m_offset] = scoreSet[m_offset + reoType];
+ }
+ else {
+ std::fill(scores.begin() + m_offset, scores.begin() + m_offset + m_configuration.GetNumberOfTypes(), 0);
+ scores[m_offset + reoType] = scoreSet[m_offset + reoType];
+ }
+ accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
}
-}
+ // else: use default scores (if specified)
+ else if (m_configuration.GetScoreProducer()->GetHaveDefaultScores()) {
+ Scores scores(m_configuration.GetScoreProducer()->GetNumScoreComponents(),0);
+ if(m_configuration.CollapseScores()) {
+ scores[m_offset] = m_configuration.GetScoreProducer()->GetDefaultScore(m_offset + reoType);
+ }
+ else {
+ scores[m_offset + reoType] = m_configuration.GetScoreProducer()->GetDefaultScore(m_offset + reoType);
+ }
+ accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
+ }
+ // note: if no default score, no cost
+
+ const SparseReordering* sparse = m_configuration.GetSparseReordering();
+ if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType, m_direction, accum);
-void LexicalReorderingState::ClearScores(Scores& scores) const
-{
- if(m_configuration.CollapseScores())
- scores[m_offset] = 0;
- else
- std::fill(scores.begin() + m_offset, scores.begin() + m_offset + m_configuration.GetNumberOfTypes(), 0);
}
-int LexicalReorderingState::ComparePrevScores(const Scores *other) const
+
+int LexicalReorderingState::ComparePrevScores(const TranslationOption *other) const
{
- if(m_prevScore == other)
+ const Scores* myPrevScores = m_prevOption->GetLexReorderingScores(m_configuration.GetScoreProducer());
+ const Scores* otherPrevScores = other->GetLexReorderingScores(m_configuration.GetScoreProducer());
+
+ if(myPrevScores == otherPrevScores)
return 0;
// The pointers are NULL if a phrase pair isn't found in the reordering table.
- if(other == NULL)
+ if(otherPrevScores == NULL)
return -1;
- if(m_prevScore == NULL)
+ if(myPrevScores == NULL)
return 1;
- const Scores &my = *m_prevScore;
- const Scores &their = *other;
for(size_t i = m_offset; i < m_offset + m_configuration.GetNumberOfTypes(); i++)
- if(my[i] < their[i])
+ if((*myPrevScores)[i] < (*otherPrevScores)[i])
return -1;
- else if(my[i] > their[i])
+ else if((*myPrevScores)[i] > (*otherPrevScores)[i])
return 1;
return 0;
@@ -189,11 +211,10 @@ int PhraseBasedReorderingState::Compare(const FFState& o) const
if (&o == this)
return 0;
- const PhraseBasedReorderingState* other = dynamic_cast<const PhraseBasedReorderingState*>(&o);
- UTIL_THROW_IF2(other == NULL, "Wrong state type");
+ const PhraseBasedReorderingState* other = static_cast<const PhraseBasedReorderingState*>(&o);
if (m_prevRange == other->m_prevRange) {
if (m_direction == LexicalReorderingConfiguration::Forward) {
- return ComparePrevScores(other->m_prevScore);
+ return ComparePrevScores(other->m_prevOption);
} else {
return 0;
}
@@ -203,27 +224,23 @@ int PhraseBasedReorderingState::Compare(const FFState& o) const
return 1;
}
-LexicalReorderingState* PhraseBasedReorderingState::Expand(const TranslationOption& topt, Scores& scores) const
+LexicalReorderingState* PhraseBasedReorderingState::Expand(const TranslationOption& topt, const InputType& input,ScoreComponentCollection* scores) const
{
ReorderingType reoType;
const WordsRange currWordsRange = topt.GetSourceWordsRange();
const LexicalReorderingConfiguration::ModelType modelType = m_configuration.GetModelType();
- if (m_direction == LexicalReorderingConfiguration::Forward && m_first) {
- ClearScores(scores);
- } else {
- if (!m_first || m_useFirstBackwardScore) {
- if (modelType == LexicalReorderingConfiguration::MSD) {
- reoType = GetOrientationTypeMSD(currWordsRange);
- } else if (modelType == LexicalReorderingConfiguration::MSLR) {
- reoType = GetOrientationTypeMSLR(currWordsRange);
- } else if (modelType == LexicalReorderingConfiguration::Monotonic) {
- reoType = GetOrientationTypeMonotonic(currWordsRange);
- } else {
- reoType = GetOrientationTypeLeftRight(currWordsRange);
- }
- CopyScores(scores, topt, reoType);
+ if ((m_direction != LexicalReorderingConfiguration::Forward && m_useFirstBackwardScore) || !m_first) {
+ if (modelType == LexicalReorderingConfiguration::MSD) {
+ reoType = GetOrientationTypeMSD(currWordsRange);
+ } else if (modelType == LexicalReorderingConfiguration::MSLR) {
+ reoType = GetOrientationTypeMSLR(currWordsRange);
+ } else if (modelType == LexicalReorderingConfiguration::Monotonic) {
+ reoType = GetOrientationTypeMonotonic(currWordsRange);
+ } else {
+ reoType = GetOrientationTypeLeftRight(currWordsRange);
}
+ CopyScores(scores, topt, input, reoType);
}
return new PhraseBasedReorderingState(this, topt);
@@ -292,7 +309,7 @@ int BidirectionalReorderingState::Compare(const FFState& o) const
if (&o == this)
return 0;
- const BidirectionalReorderingState &other = dynamic_cast<const BidirectionalReorderingState &>(o);
+ const BidirectionalReorderingState &other = static_cast<const BidirectionalReorderingState &>(o);
if(m_backward->Compare(*other.m_backward) < 0)
return -1;
else if(m_backward->Compare(*other.m_backward) > 0)
@@ -301,10 +318,10 @@ int BidirectionalReorderingState::Compare(const FFState& o) const
return m_forward->Compare(*other.m_forward);
}
-LexicalReorderingState* BidirectionalReorderingState::Expand(const TranslationOption& topt, Scores& scores) const
+LexicalReorderingState* BidirectionalReorderingState::Expand(const TranslationOption& topt, const InputType& input, ScoreComponentCollection* scores) const
{
- LexicalReorderingState *newbwd = m_backward->Expand(topt, scores);
- LexicalReorderingState *newfwd = m_forward->Expand(topt, scores);
+ LexicalReorderingState *newbwd = m_backward->Expand(topt,input, scores);
+ LexicalReorderingState *newfwd = m_forward->Expand(topt, input, scores);
return new BidirectionalReorderingState(m_configuration, newbwd, newfwd, m_offset);
}
@@ -321,11 +338,11 @@ HierarchicalReorderingBackwardState::HierarchicalReorderingBackwardState(const L
int HierarchicalReorderingBackwardState::Compare(const FFState& o) const
{
- const HierarchicalReorderingBackwardState& other = dynamic_cast<const HierarchicalReorderingBackwardState&>(o);
+ const HierarchicalReorderingBackwardState& other = static_cast<const HierarchicalReorderingBackwardState&>(o);
return m_reoStack.Compare(other.m_reoStack);
}
-LexicalReorderingState* HierarchicalReorderingBackwardState::Expand(const TranslationOption& topt, Scores& scores) const
+LexicalReorderingState* HierarchicalReorderingBackwardState::Expand(const TranslationOption& topt, const InputType& input,ScoreComponentCollection* scores) const
{
HierarchicalReorderingBackwardState* nextState = new HierarchicalReorderingBackwardState(this, topt, m_reoStack);
@@ -344,7 +361,7 @@ LexicalReorderingState* HierarchicalReorderingBackwardState::Expand(const Transl
reoType = GetOrientationTypeMonotonic(reoDistance);
}
- CopyScores(scores, topt, reoType);
+ CopyScores(scores, topt, input, reoType);
return nextState;
}
@@ -407,11 +424,10 @@ int HierarchicalReorderingForwardState::Compare(const FFState& o) const
if (&o == this)
return 0;
- const HierarchicalReorderingForwardState* other = dynamic_cast<const HierarchicalReorderingForwardState*>(&o);
- UTIL_THROW_IF2(other == NULL, "Wrong state type");
+ const HierarchicalReorderingForwardState* other = static_cast<const HierarchicalReorderingForwardState*>(&o);
if (m_prevRange == other->m_prevRange) {
- return ComparePrevScores(other->m_prevScore);
+ return ComparePrevScores(other->m_prevOption);
} else if (m_prevRange < other->m_prevRange) {
return -1;
}
@@ -429,7 +445,7 @@ int HierarchicalReorderingForwardState::Compare(const FFState& o) const
// dright: if the next phrase follows the conditioning phrase and other stuff comes in between
// dleft: if the next phrase precedes the conditioning phrase and other stuff comes in between
-LexicalReorderingState* HierarchicalReorderingForwardState::Expand(const TranslationOption& topt, Scores& scores) const
+LexicalReorderingState* HierarchicalReorderingForwardState::Expand(const TranslationOption& topt, const InputType& input,ScoreComponentCollection* scores) const
{
const LexicalReorderingConfiguration::ModelType modelType = m_configuration.GetModelType();
const WordsRange currWordsRange = topt.GetSourceWordsRange();
@@ -440,7 +456,7 @@ LexicalReorderingState* HierarchicalReorderingForwardState::Expand(const Transla
ReorderingType reoType;
if (m_first) {
- ClearScores(scores);
+
} else {
if (modelType == LexicalReorderingConfiguration::MSD) {
reoType = GetOrientationTypeMSD(currWordsRange, coverage);
@@ -452,7 +468,7 @@ LexicalReorderingState* HierarchicalReorderingForwardState::Expand(const Transla
reoType = GetOrientationTypeLeftRight(currWordsRange, coverage);
}
- CopyScores(scores, topt, reoType);
+ CopyScores(scores, topt, input, reoType);
}
return new HierarchicalReorderingForwardState(this, topt);
diff --git a/moses/FF/LexicalReordering/LexicalReorderingState.h b/moses/FF/LexicalReordering/LexicalReorderingState.h
index 8e237adc1..e309ed7f1 100644
--- a/moses/FF/LexicalReordering/LexicalReorderingState.h
+++ b/moses/FF/LexicalReordering/LexicalReorderingState.h
@@ -4,22 +4,25 @@
#include <vector>
#include <string>
+#include <boost/scoped_ptr.hpp>
+
#include "moses/Hypothesis.h"
-#include "LexicalReordering.h"
+//#include "LexicalReordering.h"
+#include "moses/ScoreComponentCollection.h"
#include "moses/WordsRange.h"
#include "moses/WordsBitmap.h"
-#include "moses/ReorderingStack.h"
#include "moses/TranslationOption.h"
#include "moses/FF/FFState.h"
+#include "ReorderingStack.h"
namespace Moses
{
class LexicalReorderingState;
class LexicalReordering;
+class SparseReordering;
/** Factory class for lexical reordering states
- * @todo There's a lot of classes for lexicalized reordering. Perhaps put them in a separate dir
*/
class LexicalReorderingConfiguration
{
@@ -31,6 +34,8 @@ public:
LexicalReorderingConfiguration(const std::string &modelType);
+ void ConfigureSparse(const std::map<std::string,std::string>& sparseArgs, const LexicalReordering* producer);
+
LexicalReorderingState *CreateLexicalReorderingState(const InputType &input) const;
size_t GetNumScoreComponents() const;
@@ -62,6 +67,10 @@ public:
return m_collapseScores;
}
+ const SparseReordering* GetSparseReordering() const {
+ return m_sparse.get();
+ }
+
private:
void SetScoreProducer(LexicalReordering* scoreProducer) {
m_scoreProducer = scoreProducer;
@@ -79,6 +88,7 @@ private:
Direction m_direction;
Condition m_condition;
size_t m_additionalScoreComponents;
+ boost::scoped_ptr<SparseReordering> m_sparse;
};
//! Abstract class for lexical reordering model states
@@ -86,34 +96,35 @@ class LexicalReorderingState : public FFState
{
public:
virtual int Compare(const FFState& o) const = 0;
- virtual LexicalReorderingState* Expand(const TranslationOption& hypo, Scores& scores) const = 0;
+ virtual LexicalReorderingState* Expand(const TranslationOption& hypo, const InputType& input, ScoreComponentCollection* scores) const = 0;
static LexicalReorderingState* CreateLexicalReorderingState(const std::vector<std::string>& config,
LexicalReorderingConfiguration::Direction dir, const InputType &input);
+ typedef int ReorderingType;
protected:
- typedef int ReorderingType;
const LexicalReorderingConfiguration &m_configuration;
// The following is the true direction of the object, which can be Backward or Forward even if the Configuration has Bidirectional.
LexicalReorderingConfiguration::Direction m_direction;
size_t m_offset;
- const Scores *m_prevScore;
+ //forward scores are conditioned on prev option, so need to remember it
+ const TranslationOption *m_prevOption;
inline LexicalReorderingState(const LexicalReorderingState *prev, const TranslationOption &topt) :
m_configuration(prev->m_configuration), m_direction(prev->m_direction), m_offset(prev->m_offset),
- m_prevScore(topt.GetLexReorderingScores(m_configuration.GetScoreProducer())) {}
+ m_prevOption(&topt) {}
inline LexicalReorderingState(const LexicalReorderingConfiguration &config, LexicalReorderingConfiguration::Direction dir, size_t offset)
- : m_configuration(config), m_direction(dir), m_offset(offset), m_prevScore(NULL) {}
+ : m_configuration(config), m_direction(dir), m_offset(offset), m_prevOption(NULL) {}
// copy the right scores in the right places, taking into account forward/backward, offset, collapse
- void CopyScores(Scores& scores, const TranslationOption& topt, ReorderingType reoType) const;
- void ClearScores(Scores& scores) const;
- int ComparePrevScores(const Scores *other) const;
+ void CopyScores(ScoreComponentCollection* scores, const TranslationOption& topt, const InputType& input, ReorderingType reoType) const;
+ int ComparePrevScores(const TranslationOption *other) const;
//constants for the different type of reorderings (corresponding to indexes in the table file)
+ public:
static const ReorderingType M = 0; // monotonic
static const ReorderingType NM = 1; // non-monotonic
static const ReorderingType S = 1; // swap
@@ -122,6 +133,7 @@ protected:
static const ReorderingType DR = 3; // discontinuous, right
static const ReorderingType R = 0; // right
static const ReorderingType L = 1; // left
+ static const ReorderingType MAX = 3; //largest possible
};
//! @todo what is this?
@@ -140,7 +152,7 @@ public:
}
virtual int Compare(const FFState& o) const;
- virtual LexicalReorderingState* Expand(const TranslationOption& topt, Scores& scores) const;
+ virtual LexicalReorderingState* Expand(const TranslationOption& topt, const InputType& input, ScoreComponentCollection* scores) const;
};
//! State for the standard Moses implementation of lexical reordering models
@@ -156,7 +168,7 @@ public:
PhraseBasedReorderingState(const PhraseBasedReorderingState *prev, const TranslationOption &topt);
virtual int Compare(const FFState& o) const;
- virtual LexicalReorderingState* Expand(const TranslationOption& topt, Scores& scores) const;
+ virtual LexicalReorderingState* Expand(const TranslationOption& topt,const InputType& input, ScoreComponentCollection* scores) const;
ReorderingType GetOrientationTypeMSD(WordsRange currRange) const;
ReorderingType GetOrientationTypeMSLR(WordsRange currRange) const;
@@ -177,7 +189,7 @@ public:
const TranslationOption &topt, ReorderingStack reoStack);
virtual int Compare(const FFState& o) const;
- virtual LexicalReorderingState* Expand(const TranslationOption& hypo, Scores& scores) const;
+ virtual LexicalReorderingState* Expand(const TranslationOption& hypo, const InputType& input, ScoreComponentCollection* scores) const;
private:
ReorderingType GetOrientationTypeMSD(int reoDistance) const;
@@ -200,7 +212,7 @@ public:
HierarchicalReorderingForwardState(const HierarchicalReorderingForwardState *prev, const TranslationOption &topt);
virtual int Compare(const FFState& o) const;
- virtual LexicalReorderingState* Expand(const TranslationOption& hypo, Scores& scores) const;
+ virtual LexicalReorderingState* Expand(const TranslationOption& hypo, const InputType& input, ScoreComponentCollection* scores) const;
private:
ReorderingType GetOrientationTypeMSD(WordsRange currRange, WordsBitmap coverage) const;
diff --git a/moses/ReorderingStack.cpp b/moses/FF/LexicalReordering/ReorderingStack.cpp
index 49a723a36..49a723a36 100644
--- a/moses/ReorderingStack.cpp
+++ b/moses/FF/LexicalReordering/ReorderingStack.cpp
diff --git a/moses/ReorderingStack.h b/moses/FF/LexicalReordering/ReorderingStack.h
index 730b17ce3..5a5b80d16 100644
--- a/moses/ReorderingStack.h
+++ b/moses/FF/LexicalReordering/ReorderingStack.h
@@ -12,7 +12,7 @@
//#include "Phrase.h"
//#include "TypeDef.h"
//#include "Util.h"
-#include "WordsRange.h"
+#include "moses/WordsRange.h"
namespace Moses
{
diff --git a/moses/FF/LexicalReordering/SparseReordering.cpp b/moses/FF/LexicalReordering/SparseReordering.cpp
new file mode 100644
index 000000000..f62dcde8b
--- /dev/null
+++ b/moses/FF/LexicalReordering/SparseReordering.cpp
@@ -0,0 +1,254 @@
+#include <fstream>
+
+#include "moses/FactorCollection.h"
+#include "moses/InputPath.h"
+#include "moses/Util.h"
+
+#include "util/exception.hh"
+
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+#include "LexicalReordering.h"
+#include "SparseReordering.h"
+
+
+using namespace std;
+
+namespace Moses
+{
+
+const std::string& SparseReorderingFeatureKey::Name (const string& wordListId) {
+ static string kSep = "-";
+ static string name;
+ ostringstream buf;
+ // type side position id word reotype
+ if (type == Phrase) {
+ buf << "phr";
+ } else if (type == Stack) {
+ buf << "stk";
+ } else if (type == Between) {
+ buf << "btn";
+ }
+ buf << kSep;
+ if (side == Source) {
+ buf << "src";
+ } else if (side == Target) {
+ buf << "tgt";
+ }
+ buf << kSep;
+ if (position == First) {
+ buf << "first";
+ } else if (position == Last) {
+ buf << "last";
+ }
+ buf << kSep;
+ buf << wordListId;
+ buf << kSep;
+ if (isCluster) buf << "cluster_";
+ buf << word->GetString();
+ buf << kSep;
+ buf << reoType;
+ name = buf.str();
+ return name;
+}
+
+SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
+ : m_producer(producer)
+{
+ static const string kSource= "source";
+ static const string kTarget = "target";
+ for (map<string,string>::const_iterator i = config.begin(); i != config.end(); ++i) {
+ vector<string> fields = Tokenize(i->first, "-");
+ if (fields[0] == "words") {
+ UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering word list name should be sparse-words-(source|target)-<id>");
+ if (fields[1] == kSource) {
+ ReadWordList(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceWordLists);
+ } else if (fields[1] == kTarget) {
+ ReadWordList(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetWordLists);
+ } else {
+ UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
+ }
+ } else if (fields[0] == "clusters") {
+ UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering cluster name should be sparse-clusters-(source|target)-<id>");
+ if (fields[1] == kSource) {
+ ReadClusterMap(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceClusterMaps);
+ } else if (fields[1] == kTarget) {
+ ReadClusterMap(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetClusterMaps);
+ } else {
+ UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
+ }
+
+ } else if (fields[0] == "phrase") {
+ m_usePhrase = true;
+ } else if (fields[0] == "stack") {
+ m_useStack = true;
+ } else if (fields[0] == "between") {
+ m_useBetween = true;
+ } else {
+ UTIL_THROW(util::Exception, "Unable to parse sparse reordering option: " << i->first);
+ }
+ }
+
+}
+
+void SparseReordering::PreCalculateFeatureNames(size_t index, const string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster) {
+ for (size_t type = SparseReorderingFeatureKey::Stack;
+ type <= SparseReorderingFeatureKey::Between; ++type) {
+ for (size_t position = SparseReorderingFeatureKey::First;
+ position <= SparseReorderingFeatureKey::Last; ++position) {
+ for (int reoType = 0; reoType <= LexicalReorderingState::MAX; ++reoType) {
+ SparseReorderingFeatureKey key(
+ index, static_cast<SparseReorderingFeatureKey::Type>(type), factor, isCluster,
+ static_cast<SparseReorderingFeatureKey::Position>(position), side, reoType);
+ m_featureMap.insert(pair<SparseReorderingFeatureKey, FName>(key,m_producer->GetFeatureName(key.Name(id))));
+ }
+ }
+ }
+}
+
+void SparseReordering::ReadWordList(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<WordList>* pWordLists) {
+ ifstream fh(filename.c_str());
+ UTIL_THROW_IF(!fh, util::Exception, "Unable to open: " << filename);
+ string line;
+ pWordLists->push_back(WordList());
+ pWordLists->back().first = id;
+ while (getline(fh,line)) {
+ //TODO: StringPiece
+ const Factor* factor = FactorCollection::Instance().AddFactor(line);
+ pWordLists->back().second.insert(factor);
+ PreCalculateFeatureNames(pWordLists->size()-1, id, side, factor, false);
+
+ }
+}
+
+void SparseReordering::ReadClusterMap(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<ClusterMap>* pClusterMaps) {
+ pClusterMaps->push_back(ClusterMap());
+ pClusterMaps->back().first = id;
+ util::FilePiece file(filename.c_str());
+ StringPiece line;
+ while (true) {
+ try {
+ line = file.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+ util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter('\t'));
+ if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing word): '" << line << "'");
+ const Factor* wordFactor = FactorCollection::Instance().AddFactor(*lineIter);
+ ++lineIter;
+ if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing cluster id): '" << line << "'");
+ const Factor* idFactor = FactorCollection::Instance().AddFactor(*lineIter);
+ pClusterMaps->back().second[wordFactor] = idFactor;
+ PreCalculateFeatureNames(pClusterMaps->size()-1, id, side, idFactor, true);
+ }
+}
+
+void SparseReordering::AddFeatures(
+ SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
+ const Word& word, SparseReorderingFeatureKey::Position position,
+ LexicalReorderingState::ReorderingType reoType,
+ ScoreComponentCollection* scores) const {
+
+ const Factor* wordFactor = word.GetFactor(0);
+
+ const vector<WordList>* wordLists;
+ const vector<ClusterMap>* clusterMaps;
+ if (side == SparseReorderingFeatureKey::Source) {
+ wordLists = &m_sourceWordLists;
+ clusterMaps = &m_sourceClusterMaps;
+ } else {
+ wordLists = &m_targetWordLists;
+ clusterMaps = &m_targetClusterMaps;
+ }
+
+ for (size_t id = 0; id < wordLists->size(); ++id) {
+ if ((*wordLists)[id].second.find(wordFactor) == (*wordLists)[id].second.end()) continue;
+ SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType);
+ FeatureMap::const_iterator fmi = m_featureMap.find(key);
+ assert(fmi != m_featureMap.end());
+ scores->SparsePlusEquals(fmi->second, 1.0);
+ }
+
+ for (size_t id = 0; id < clusterMaps->size(); ++id) {
+ const ClusterMap& clusterMap = (*clusterMaps)[id];
+ boost::unordered_map<const Factor*, const Factor*>::const_iterator clusterIter
+ = clusterMap.second.find(wordFactor);
+ if (clusterIter != clusterMap.second.end()) {
+ SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType);
+ FeatureMap::const_iterator fmi = m_featureMap.find(key);
+ assert(fmi != m_featureMap.end());
+ scores->SparsePlusEquals(fmi->second, 1.0);
+ }
+ }
+
+}
+
+void SparseReordering::CopyScores(
+ const TranslationOption& currentOpt,
+ const TranslationOption* previousOpt,
+ const InputType& input,
+ LexicalReorderingState::ReorderingType reoType,
+ LexicalReorderingConfiguration::Direction direction,
+ ScoreComponentCollection* scores) const
+{
+ if (m_useBetween && direction == LexicalReorderingConfiguration::Backward &&
+ (reoType == LexicalReorderingState::D || reoType == LexicalReorderingState::DL ||
+ reoType == LexicalReorderingState::DR)) {
+ size_t gapStart, gapEnd;
+ //NB: Using a static cast for speed, but could be nasty if
+ //using non-sentence input
+ const Sentence& sentence = static_cast<const Sentence&>(input);
+ const WordsRange& currentRange = currentOpt.GetSourceWordsRange();
+ if (previousOpt) {
+ const WordsRange& previousRange = previousOpt->GetSourceWordsRange();
+ if (previousRange < currentRange) {
+ gapStart = previousRange.GetEndPos() + 1;
+ gapEnd = currentRange.GetStartPos();
+ } else {
+ gapStart = currentRange.GetEndPos() + 1;
+ gapEnd = previousRange.GetStartPos();
+ }
+ } else {
+ //start of sentence
+ gapStart = 0;
+ gapEnd = currentRange.GetStartPos();
+ }
+ assert(gapStart < gapEnd);
+ for (size_t i = gapStart; i < gapEnd; ++i) {
+ AddFeatures(SparseReorderingFeatureKey::Between,
+ SparseReorderingFeatureKey::Source, sentence.GetWord(i),
+ SparseReorderingFeatureKey::First, reoType, scores);
+ }
+ }
+ //std::cerr << "SR " << topt << " " << reoType << " " << direction << std::endl;
+ //phrase (backward)
+ //stack (forward)
+ SparseReorderingFeatureKey::Type type;
+ if (direction == LexicalReorderingConfiguration::Forward) {
+ if (!m_useStack) return;
+ type = SparseReorderingFeatureKey::Stack;
+ } else if (direction == LexicalReorderingConfiguration::Backward) {
+ if (!m_usePhrase) return;
+ type = SparseReorderingFeatureKey::Phrase;
+ } else {
+ //Shouldn't be called for bidirectional
+ //keep compiler happy
+ type = SparseReorderingFeatureKey::Phrase;
+ assert(!"Shouldn't call CopyScores() with bidirectional direction");
+ }
+ const Phrase& sourcePhrase = currentOpt.GetInputPath().GetPhrase();
+ AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(0),
+ SparseReorderingFeatureKey::First, reoType, scores);
+ AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(sourcePhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores);
+ const Phrase& targetPhrase = currentOpt.GetTargetPhrase();
+ AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(0),
+ SparseReorderingFeatureKey::First, reoType, scores);
+ AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(targetPhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores);
+
+
+}
+
+} //namespace
+
diff --git a/moses/FF/LexicalReordering/SparseReordering.h b/moses/FF/LexicalReordering/SparseReordering.h
new file mode 100644
index 000000000..663785a88
--- /dev/null
+++ b/moses/FF/LexicalReordering/SparseReordering.h
@@ -0,0 +1,133 @@
+#ifndef moses_FF_LexicalReordering_SparseReordering_h
+#define moses_FF_LexicalReordering_SparseReordering_h
+
+/**
+ * Sparse reordering features for phrase-based MT, following Cherry (NAACL, 2013)
+**/
+
+
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <boost/unordered_set.hpp>
+
+#include "util/murmur_hash.hh"
+#include "util/pool.hh"
+#include "util/string_piece.hh"
+
+#include "moses/FeatureVector.h"
+#include "moses/ScoreComponentCollection.h"
+#include "LexicalReorderingState.h"
+
+/**
+ Configuration of sparse reordering:
+
+ The sparse reordering feature is configured using sparse-* configs in the lexical reordering line.
+ sparse-words-(source|target)-<id>=<filename> -- Features which fire for the words in the list
+ sparse-clusters-(source|target)-<id>=<filename> -- Features which fire for clusters in the list. Format
+ of cluster file TBD
+ sparse-phrase -- Add features which depend on the current phrase (backward)
+ sparse-stack -- Add features which depend on the previous phrase, or
+ top of stack. (forward)
+ sparse-between -- Add features which depend on words between previous phrase
+ (or top of stack) and current phrase.
+**/
+
+namespace Moses
+{
+
+/**
+ * Used to store pre-calculated feature names.
+**/
+struct SparseReorderingFeatureKey {
+ size_t id;
+ enum Type {Stack, Phrase, Between} type;
+ const Factor* word;
+ bool isCluster;
+ enum Position {First, Last} position;
+ enum Side {Source, Target} side;
+ LexicalReorderingState::ReorderingType reoType;
+
+ SparseReorderingFeatureKey(size_t id_, Type type_, const Factor* word_, bool isCluster_,
+ Position position_, Side side_, LexicalReorderingState::ReorderingType reoType_)
+ : id(id_), type(type_), word(word_), isCluster(isCluster_),
+ position(position_), side(side_), reoType(reoType_)
+ {}
+
+ const std::string& Name(const std::string& wordListId) ;
+};
+
+struct HashSparseReorderingFeatureKey : public std::unary_function<SparseReorderingFeatureKey, std::size_t> {
+ std::size_t operator()(const SparseReorderingFeatureKey& key) const {
+ //TODO: can we just hash the memory?
+ //not sure, there could be random padding
+ std::size_t seed = 0;
+ seed = util::MurmurHashNative(&key.id, sizeof(key.id), seed);
+ seed = util::MurmurHashNative(&key.type, sizeof(key.type), seed);
+ seed = util::MurmurHashNative(&key.word, sizeof(key.word), seed);
+ seed = util::MurmurHashNative(&key.isCluster, sizeof(key.isCluster), seed);
+ seed = util::MurmurHashNative(&key.position, sizeof(key.position), seed);
+ seed = util::MurmurHashNative(&key.side, sizeof(key.side), seed);
+ seed = util::MurmurHashNative(&key.reoType, sizeof(key.reoType), seed);
+ return seed;
+ }
+};
+
+struct EqualsSparseReorderingFeatureKey :
+ public std::binary_function<SparseReorderingFeatureKey, SparseReorderingFeatureKey, bool> {
+ bool operator()(const SparseReorderingFeatureKey& left, const SparseReorderingFeatureKey& right) const {
+ //TODO: Can we just compare the memory?
+ return left.id == right.id && left.type == right.type && left.word == right.word &&
+ left.position == right.position && left.side == right.side &&
+ left.reoType == right.reoType;
+ }
+};
+
+class SparseReordering
+{
+public:
+ SparseReordering(const std::map<std::string,std::string>& config, const LexicalReordering* producer);
+
+ //If direction is backward the options will be different, for forward they will be the same
+ void CopyScores(const TranslationOption& currentOpt,
+ const TranslationOption* previousOpt,
+ const InputType& input,
+ LexicalReorderingState::ReorderingType reoType,
+ LexicalReorderingConfiguration::Direction direction,
+ ScoreComponentCollection* scores) const ;
+
+private:
+ const LexicalReordering* m_producer;
+ typedef std::pair<std::string, boost::unordered_set<const Factor*> > WordList; //id and list
+ std::vector<WordList> m_sourceWordLists;
+ std::vector<WordList> m_targetWordLists;
+ typedef std::pair<std::string, boost::unordered_map<const Factor*, const Factor*> > ClusterMap; //id and map
+ std::vector<ClusterMap> m_sourceClusterMaps;
+ std::vector<ClusterMap> m_targetClusterMaps;
+ bool m_usePhrase;
+ bool m_useBetween;
+ bool m_useStack;
+ typedef boost::unordered_map<SparseReorderingFeatureKey, FName, HashSparseReorderingFeatureKey, EqualsSparseReorderingFeatureKey> FeatureMap;
+ FeatureMap m_featureMap;
+
+ void ReadWordList(const std::string& filename, const std::string& id,
+ SparseReorderingFeatureKey::Side side, std::vector<WordList>* pWordLists);
+ void ReadClusterMap(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector<ClusterMap>* pClusterMaps);
+ void PreCalculateFeatureNames(size_t index, const std::string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster);
+
+ void AddFeatures(
+ SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
+ const Word& word, SparseReorderingFeatureKey::Position position,
+ LexicalReorderingState::ReorderingType reoType,
+ ScoreComponentCollection* scores) const;
+
+};
+
+
+
+} //namespace
+
+
+#endif
diff --git a/moses/FF/MaxSpanFreeNonTermSource.cpp b/moses/FF/MaxSpanFreeNonTermSource.cpp
index 987fd713c..9de582635 100644
--- a/moses/FF/MaxSpanFreeNonTermSource.cpp
+++ b/moses/FF/MaxSpanFreeNonTermSource.cpp
@@ -14,10 +14,10 @@ using namespace std;
namespace Moses
{
MaxSpanFreeNonTermSource::MaxSpanFreeNonTermSource(const std::string &line)
- :StatelessFeatureFunction(1, line)
- ,m_maxSpan(2)
- ,m_glueTargetLHSStr("S")
- ,m_glueTargetLHS(true)
+:StatelessFeatureFunction(1, line)
+,m_maxSpan(2)
+,m_glueTargetLHSStr("S")
+,m_glueTargetLHS(true)
{
m_tuneable = false;
ReadParameters();
@@ -27,26 +27,26 @@ MaxSpanFreeNonTermSource::MaxSpanFreeNonTermSource(const std::string &line)
m_glueTargetLHS.SetFactor(0, factor);
}
-void MaxSpanFreeNonTermSource::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+void MaxSpanFreeNonTermSource::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
targetPhrase.SetRuleSource(source);
}
-void MaxSpanFreeNonTermSource::Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore) const
+void MaxSpanFreeNonTermSource::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
{
const Word &targetLHS = targetPhrase.GetTargetLHS();
if (targetLHS == m_glueTargetLHS) {
- // don't delete glue rules
- return;
+ // don't delete glue rules
+ return;
}
const Phrase *source = targetPhrase.GetRuleSource();
@@ -54,17 +54,17 @@ void MaxSpanFreeNonTermSource::Evaluate(const InputType &input
float score = 0;
if (source->Front().IsNonTerminal()) {
- const ChartCellLabel &cell = *stackVec->front();
- if (cell.GetCoverage().GetNumWordsCovered() > m_maxSpan) {
- score = - std::numeric_limits<float>::infinity();
- }
+ const ChartCellLabel &cell = *stackVec->front();
+ if (cell.GetCoverage().GetNumWordsCovered() > m_maxSpan) {
+ score = - std::numeric_limits<float>::infinity();
+ }
}
if (source->Back().IsNonTerminal()) {
- const ChartCellLabel &cell = *stackVec->back();
- if (cell.GetCoverage().GetNumWordsCovered() > m_maxSpan) {
- score = - std::numeric_limits<float>::infinity();
- }
+ const ChartCellLabel &cell = *stackVec->back();
+ if (cell.GetCoverage().GetNumWordsCovered() > m_maxSpan) {
+ score = - std::numeric_limits<float>::infinity();
+ }
}
@@ -76,7 +76,7 @@ void MaxSpanFreeNonTermSource::Evaluate(const InputType &input
void MaxSpanFreeNonTermSource::SetParameter(const std::string& key, const std::string& value)
{
if (key == "max-span") {
- m_maxSpan = Scan<int>(value);
+ m_maxSpan = Scan<int>(value);
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
@@ -84,8 +84,8 @@ void MaxSpanFreeNonTermSource::SetParameter(const std::string& key, const std::s
std::vector<float> MaxSpanFreeNonTermSource::DefaultWeights() const
{
- std::vector<float> ret(1, 1);
- return ret;
+ std::vector<float> ret(1, 1);
+ return ret;
}
}
diff --git a/moses/FF/MaxSpanFreeNonTermSource.h b/moses/FF/MaxSpanFreeNonTermSource.h
index 9c85def83..973b374d8 100644
--- a/moses/FF/MaxSpanFreeNonTermSource.h
+++ b/moses/FF/MaxSpanFreeNonTermSource.h
@@ -10,34 +10,33 @@ namespace Moses
class MaxSpanFreeNonTermSource : public StatelessFeatureFunction
{
public:
- MaxSpanFreeNonTermSource(const std::string &line);
-
- virtual bool IsUseable(const FactorMask &mask) const {
- return true;
- }
-
- virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
-
- virtual void Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const;
-
- virtual void Evaluate(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
- {}
-
- virtual void EvaluateChart(const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const
- {}
-
- void SetParameter(const std::string& key, const std::string& value);
- std::vector<float> DefaultWeights() const;
+ MaxSpanFreeNonTermSource(const std::string &line);
+
+ virtual bool IsUseable(const FactorMask &mask) const
+ { return true; }
+
+ virtual void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
+
+ virtual void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ void SetParameter(const std::string& key, const std::string& value);
+ std::vector<float> DefaultWeights() const;
protected:
int m_maxSpan;
diff --git a/moses/FF/NieceTerminal.cpp b/moses/FF/NieceTerminal.cpp
index 3d8fd3d22..b3a5f8f92 100644
--- a/moses/FF/NieceTerminal.cpp
+++ b/moses/FF/NieceTerminal.cpp
@@ -17,20 +17,28 @@ NieceTerminal::NieceTerminal(const std::string &line)
ReadParameters();
}
-void NieceTerminal::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+std::vector<float> NieceTerminal::DefaultWeights() const
+{
+ UTIL_THROW_IF2(m_numScoreComponents != 1,
+ "NieceTerminal must only have 1 score");
+ vector<float> ret(1, 1);
+ return ret;
+}
+
+void NieceTerminal::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
targetPhrase.SetRuleSource(source);
}
-void NieceTerminal::Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore) const
+void NieceTerminal::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
{
assert(stackVec);
@@ -39,59 +47,59 @@ void NieceTerminal::Evaluate(const InputType &input
std::set<Word> terms;
for (size_t i = 0; i < ruleSource->GetSize(); ++i) {
- const Word &word = ruleSource->GetWord(i);
- if (!word.IsNonTerminal()) {
- terms.insert(word);
- }
+ const Word &word = ruleSource->GetWord(i);
+ if (!word.IsNonTerminal()) {
+ terms.insert(word);
+ }
}
for (size_t i = 0; i < stackVec->size(); ++i) {
- const ChartCellLabel &cell = *stackVec->at(i);
- const WordsRange &ntRange = cell.GetCoverage();
- bool containTerm = ContainTerm(input, ntRange, terms);
-
- if (containTerm) {
- //cerr << "ruleSource=" << *ruleSource << " ";
- //cerr << "ntRange=" << ntRange << endl;
-
- // non-term contains 1 of the terms in the rule.
- float score = m_hardConstraint ? - std::numeric_limits<float>::infinity() : 1;
- scoreBreakdown.PlusEquals(this, score);
- return;
- }
+ const ChartCellLabel &cell = *stackVec->at(i);
+ const WordsRange &ntRange = cell.GetCoverage();
+ bool containTerm = ContainTerm(input, ntRange, terms);
+
+ if (containTerm) {
+ //cerr << "ruleSource=" << *ruleSource << " ";
+ //cerr << "ntRange=" << ntRange << endl;
+
+ // non-term contains 1 of the terms in the rule.
+ float score = m_hardConstraint ? - std::numeric_limits<float>::infinity() : 1;
+ scoreBreakdown.PlusEquals(this, score);
+ return;
+ }
}
}
-void NieceTerminal::Evaluate(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+void NieceTerminal::EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
{}
-void NieceTerminal::EvaluateChart(const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const
+void NieceTerminal::EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
{}
bool NieceTerminal::ContainTerm(const InputType &input,
- const WordsRange &ntRange,
- const std::set<Word> &terms) const
+ const WordsRange &ntRange,
+ const std::set<Word> &terms) const
{
- std::set<Word>::const_iterator iter;
+ std::set<Word>::const_iterator iter;
- for (size_t pos = ntRange.GetStartPos(); pos <= ntRange.GetEndPos(); ++pos) {
- const Word &word = input.GetWord(pos);
- iter = terms.find(word);
+ for (size_t pos = ntRange.GetStartPos(); pos <= ntRange.GetEndPos(); ++pos) {
+ const Word &word = input.GetWord(pos);
+ iter = terms.find(word);
- if (iter != terms.end()) {
- return true;
- }
- }
- return false;
+ if (iter != terms.end()) {
+ return true;
+ }
+ }
+ return false;
}
void NieceTerminal::SetParameter(const std::string& key, const std::string& value)
{
if (key == "hard-constraint") {
- m_hardConstraint = Scan<bool>(value);
+ m_hardConstraint = Scan<bool>(value);
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
diff --git a/moses/FF/NieceTerminal.h b/moses/FF/NieceTerminal.h
index d819972ba..2f88f0934 100644
--- a/moses/FF/NieceTerminal.h
+++ b/moses/FF/NieceTerminal.h
@@ -19,22 +19,23 @@ public:
return true;
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const;
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const;
void SetParameter(const std::string& key, const std::string& value);
+ std::vector<float> DefaultWeights() const;
protected:
bool m_hardConstraint;
diff --git a/moses/FF/OSM-Feature/KenOSM.cpp b/moses/FF/OSM-Feature/KenOSM.cpp
new file mode 100644
index 000000000..e517200c3
--- /dev/null
+++ b/moses/FF/OSM-Feature/KenOSM.cpp
@@ -0,0 +1,32 @@
+#include "KenOSM.h"
+
+namespace Moses
+{
+
+OSMLM* ConstructOSMLM(const std::string &file)
+{
+ lm::ngram::ModelType model_type;
+ if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
+
+ switch(model_type) {
+ case lm::ngram::PROBING:
+ return new KenOSM<lm::ngram::ProbingModel>(file);
+ case lm::ngram::REST_PROBING:
+ return new KenOSM<lm::ngram::RestProbingModel>(file);
+ case lm::ngram::TRIE:
+ return new KenOSM<lm::ngram::TrieModel>(file);
+ case lm::ngram::QUANT_TRIE:
+ return new KenOSM<lm::ngram::QuantTrieModel>(file);
+ case lm::ngram::ARRAY_TRIE:
+ return new KenOSM<lm::ngram::ArrayTrieModel>(file);
+ case lm::ngram::QUANT_ARRAY_TRIE:
+ return new KenOSM<lm::ngram::QuantArrayTrieModel>(file);
+ default:
+ UTIL_THROW2("Unrecognized kenlm model type " << model_type);
+ }
+ } else {
+ return new KenOSM<lm::ngram::ProbingModel>(file);
+ }
+}
+
+} // namespace
diff --git a/moses/FF/OSM-Feature/KenOSM.h b/moses/FF/OSM-Feature/KenOSM.h
new file mode 100644
index 000000000..d3d8672d3
--- /dev/null
+++ b/moses/FF/OSM-Feature/KenOSM.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <string>
+#include "lm/model.hh"
+#include <boost/shared_ptr.hpp>
+
+namespace Moses
+{
+
+class KenOSMBase {
+ public:
+ virtual float Score(const lm::ngram::State&, const std::string&,
+ lm::ngram::State&) const = 0;
+
+ virtual const lm::ngram::State &BeginSentenceState() const = 0;
+
+ virtual const lm::ngram::State &NullContextState() const = 0;
+};
+
+template <class KenModel>
+class KenOSM : public KenOSMBase {
+ public:
+ KenOSM(const std::string& file)
+ : m_kenlm(new KenModel(file.c_str())) {}
+
+ virtual float Score(const lm::ngram::State &in_state,
+ const std::string& word,
+ lm::ngram::State &out_state) const {
+ return m_kenlm->Score(in_state, m_kenlm->GetVocabulary().Index(word),
+ out_state);
+ }
+
+ virtual const lm::ngram::State &BeginSentenceState() const {
+ return m_kenlm->BeginSentenceState();
+ }
+
+ virtual const lm::ngram::State &NullContextState() const {
+ return m_kenlm->NullContextState();
+ }
+
+ private:
+ boost::shared_ptr<KenModel> m_kenlm;
+};
+
+typedef KenOSMBase OSMLM;
+
+OSMLM* ConstructOSMLM(const std::string &file);
+
+
+} // namespace
diff --git a/moses/FF/OSM-Feature/OpSequenceModel.cpp b/moses/FF/OSM-Feature/OpSequenceModel.cpp
index 0bb7aed95..6d839f0cc 100644
--- a/moses/FF/OSM-Feature/OpSequenceModel.cpp
+++ b/moses/FF/OSM-Feature/OpSequenceModel.cpp
@@ -19,19 +19,18 @@ OpSequenceModel::OpSequenceModel(const std::string &line)
ReadParameters();
}
-OpSequenceModel::~OpSequenceModel()
-{
- delete OSM;
+OpSequenceModel::~OpSequenceModel() {
+ delete OSM;
}
void OpSequenceModel :: readLanguageModel(const char *lmFile)
{
-
string unkOp = "_TRANS_SLF_";
- OSM = new Model(m_lmPath.c_str());
+ OSM = ConstructOSMLM(m_lmPath);
+
State startState = OSM->NullContextState();
State endState;
- unkOpProb = OSM->Score(startState,OSM->GetVocabulary().Index(unkOp),endState);
+ unkOpProb = OSM->Score(startState,unkOp,endState);
}
@@ -42,7 +41,7 @@ void OpSequenceModel::Load()
-void OpSequenceModel:: Evaluate(const Phrase &source
+void OpSequenceModel:: EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
@@ -87,7 +86,7 @@ void OpSequenceModel:: Evaluate(const Phrase &source
}
-FFState* OpSequenceModel::Evaluate(
+FFState* OpSequenceModel::EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
@@ -194,12 +193,12 @@ FFState* OpSequenceModel::Evaluate(
// return NULL;
}
-FFState* OpSequenceModel::EvaluateChart(
+FFState* OpSequenceModel::EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const
{
- UTIL_THROW2("Chart decoding not support by UTIL_THROW2");
+  UTIL_THROW2("Chart decoding is not supported by OpSequenceModel");
}
diff --git a/moses/FF/OSM-Feature/OpSequenceModel.h b/moses/FF/OSM-Feature/OpSequenceModel.h
index 64cab3044..8c71e8152 100644
--- a/moses/FF/OSM-Feature/OpSequenceModel.h
+++ b/moses/FF/OSM-Feature/OpSequenceModel.h
@@ -6,8 +6,7 @@
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/Manager.h"
#include "moses/FF/OSM-Feature/osmHyp.h"
-#include "lm/model.hh"
-
+#include "KenOSM.h"
namespace Moses
{
@@ -16,8 +15,7 @@ class OpSequenceModel : public StatefulFeatureFunction
{
public:
-
- lm::ngram::Model * OSM;
+ OSMLM* OSM;
float unkOpProb;
int sFactor; // Source Factor ...
int tFactor; // Target Factor ...
@@ -29,24 +27,24 @@ public:
void readLanguageModel(const char *);
void Load();
- FFState* Evaluate(
+ FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- virtual FFState* EvaluateChart(
+ virtual FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
diff --git a/moses/FF/OSM-Feature/osmHyp.cpp b/moses/FF/OSM-Feature/osmHyp.cpp
index d492888ed..422b7c933 100644
--- a/moses/FF/OSM-Feature/osmHyp.cpp
+++ b/moses/FF/OSM-Feature/osmHyp.cpp
@@ -121,7 +121,7 @@ void osmHypothesis :: removeReorderingOperations()
operations = tupleSequence;
}
-void osmHypothesis :: calculateOSMProb(Model & ptrOp)
+void osmHypothesis :: calculateOSMProb(OSMLM& ptrOp)
{
opProb = 0;
@@ -130,7 +130,7 @@ void osmHypothesis :: calculateOSMProb(Model & ptrOp)
for (int i = 0; i<operations.size(); i++) {
temp = currState;
- opProb += ptrOp.Score(temp,ptrOp.GetVocabulary().Index(operations[i]),currState);
+ opProb += ptrOp.Score(temp,operations[i],currState);
}
lmState = currState;
diff --git a/moses/FF/OSM-Feature/osmHyp.h b/moses/FF/OSM-Feature/osmHyp.h
index 2459b4d23..88f171188 100644
--- a/moses/FF/OSM-Feature/osmHyp.h
+++ b/moses/FF/OSM-Feature/osmHyp.h
@@ -2,12 +2,13 @@
# include "moses/FF/FFState.h"
# include "moses/Manager.h"
-#include "lm/model.hh"
# include <set>
# include <map>
# include <string>
# include <vector>
+#include "KenOSM.h"
+
namespace Moses
{
@@ -79,7 +80,7 @@ public:
~osmHypothesis() {};
void generateOperations(int & startIndex, int j1 , int contFlag , WordsBitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF);
void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes);
- void calculateOSMProb(lm::ngram::Model & ptrOp);
+ void calculateOSMProb(OSMLM& ptrOp);
void computeOSMFeature(int startIndex , WordsBitmap & coverageVector);
void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength);
void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2) {
diff --git a/moses/FF/PhraseBoundaryFeature.cpp b/moses/FF/PhraseBoundaryFeature.cpp
index d82181b76..3fdcf27f9 100644
--- a/moses/FF/PhraseBoundaryFeature.cpp
+++ b/moses/FF/PhraseBoundaryFeature.cpp
@@ -66,7 +66,7 @@ void PhraseBoundaryFeature::AddFeatures(
}
-FFState* PhraseBoundaryFeature::Evaluate
+FFState* PhraseBoundaryFeature::EvaluateWhenApplied
(const Hypothesis& cur_hypo, const FFState* prev_state,
ScoreComponentCollection* scores) const
{
diff --git a/moses/FF/PhraseBoundaryFeature.h b/moses/FF/PhraseBoundaryFeature.h
index fbafc6da9..e4c3ca3ba 100644
--- a/moses/FF/PhraseBoundaryFeature.h
+++ b/moses/FF/PhraseBoundaryFeature.h
@@ -44,23 +44,23 @@ public:
virtual const FFState* EmptyHypothesisState(const InputType &) const;
- virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
+ virtual FFState* EvaluateWhenApplied(const Hypothesis& cur_hypo, const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */,
+ virtual FFState* EvaluateWhenApplied( const ChartHypothesis& /* cur_hypo */,
int /* featureID */,
ScoreComponentCollection* ) const {
throw std::logic_error("PhraseBoundaryState not supported in chart decoder, yet");
}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/PhraseLengthFeature.cpp b/moses/FF/PhraseLengthFeature.cpp
index 43e0d1b2d..7850c374a 100644
--- a/moses/FF/PhraseLengthFeature.cpp
+++ b/moses/FF/PhraseLengthFeature.cpp
@@ -15,7 +15,7 @@ PhraseLengthFeature::PhraseLengthFeature(const std::string &line)
ReadParameters();
}
-void PhraseLengthFeature::Evaluate(const Phrase &source
+void PhraseLengthFeature::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/PhraseLengthFeature.h b/moses/FF/PhraseLengthFeature.h
index ba835f654..9233aa3e7 100644
--- a/moses/FF/PhraseLengthFeature.h
+++ b/moses/FF/PhraseLengthFeature.h
@@ -24,16 +24,15 @@ public:
return true;
}
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis& hypo,
- ScoreComponentCollection*) const {
- throw std::logic_error("PhraseLengthFeature not valid in chart decoder");
- }
+ void EvaluateWhenApplied(const ChartHypothesis& hypo,
+ ScoreComponentCollection*) const
+ {}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
@@ -41,7 +40,7 @@ public:
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- virtual void Evaluate(const Phrase &source
+ virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
diff --git a/moses/FF/PhraseOrientationFeature.cpp b/moses/FF/PhraseOrientationFeature.cpp
new file mode 100644
index 000000000..70023a793
--- /dev/null
+++ b/moses/FF/PhraseOrientationFeature.cpp
@@ -0,0 +1,617 @@
+//
+// REFERENCE
+// ---------
+// When using this feature, please cite:
+//
+// Matthias Huck, Joern Wuebker, Felix Rietig, and Hermann Ney.
+// A Phrase Orientation Model for Hierarchical Machine Translation.
+// In ACL 2013 Eighth Workshop on Statistical Machine Translation (WMT 2013), pages 452-463, Sofia, Bulgaria, August 2013.
+//
+
+#include "PhraseOrientationFeature.h"
+#include "moses/InputFileStream.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/StaticData.h"
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/ChartManager.h"
+#include "moses/FactorCollection.h"
+#include "moses/PP/OrientationPhraseProperty.h"
+#include "phrase-extract/extract-ghkm/Alignment.h"
+
+
+namespace Moses
+{
+
+PhraseOrientationFeature::PhraseOrientationFeature(const std::string &line)
+ : StatefulFeatureFunction(6, line)
+ , m_glueTargetLHSStr("Q")
+ , m_glueTargetLHS(true)
+ , m_offsetR2LScores(0)
+{
+ VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
+ ReadParameters();
+ FactorCollection &fc = FactorCollection::Instance();
+ const Factor *factor = fc.AddFactor(m_glueTargetLHSStr, true);
+ m_glueTargetLHS.SetFactor(0, factor);
+ m_offsetR2LScores = m_numScoreComponents / 2;
+ VERBOSE(1, " Done." << std::endl);
+}
+
+void PhraseOrientationFeature::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "glueTargetLHS")
+ {
+ m_glueTargetLHSStr = value;
+ }
+ else
+ {
+ StatefulFeatureFunction::SetParameter(key, value);
+ }
+}
+
+
+FFState* PhraseOrientationFeature::EvaluateWhenApplied(
+ const ChartHypothesis& hypo,
+ int featureID, // used to index the state in the previous hypotheses
+ ScoreComponentCollection* accumulator) const
+{
+ // Dense scores
+ std::vector<float> newScores(m_numScoreComponents,0);
+
+ // State: ignored wrt. recombination; used to propagate orientation probabilities in case of boundary non-terminals
+ PhraseOrientationFeatureState *state = new PhraseOrientationFeatureState();
+
+ // Read Orientation property
+ const TargetPhrase &currTarPhr = hypo.GetCurrTargetPhrase();
+ const Word &currTarPhrLHS = currTarPhr.GetTargetLHS();
+ const Phrase *currSrcPhr = currTarPhr.GetRuleSource();
+// const Factor* targetLHS = currTarPhr.GetTargetLHS()[0];
+// bool isGlueGrammarRule = false;
+
+ FEATUREVERBOSE(2, *currSrcPhr << std::endl);
+ FEATUREVERBOSE(2, currTarPhr << std::endl);
+
+ Moses::GHKM::Alignment alignment; // TODO: Efficiency! It's not necessary to fill a Moses::GHKM::Alignment object and then touch everything again in Moses::GHKM::PhraseOrientation's constructor
+
+ for (AlignmentInfo::const_iterator it=currTarPhr.GetAlignTerm().begin();
+ it!=currTarPhr.GetAlignTerm().end(); ++it)
+ {
+ alignment.push_back(std::make_pair(it->first, it->second));
+ FEATUREVERBOSE(2, "alignTerm " << it->first << " " << it->second << std::endl);
+ }
+
+ for (AlignmentInfo::const_iterator it=currTarPhr.GetAlignNonTerm().begin();
+ it!=currTarPhr.GetAlignNonTerm().end(); ++it)
+ {
+ alignment.push_back(std::make_pair(it->first, it->second));
+ FEATUREVERBOSE(2, "alignNonTerm " << it->first << " " << it->second << std::endl);
+ }
+
+ // Initialize phrase orientation scoring object
+ Moses::GHKM::PhraseOrientation phraseOrientation(currSrcPhr->GetSize(), currTarPhr.GetSize(), alignment); // TODO: Efficiency! This should be precomputed.
+
+ // Get index map for underlying hypotheses
+ const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
+ currTarPhr.GetAlignNonTerm().GetNonTermIndexMap();
+
+ // Determine & score orientations
+
+ for (AlignmentInfo::const_iterator it=currTarPhr.GetAlignNonTerm().begin();
+ it!=currTarPhr.GetAlignNonTerm().end(); ++it)
+ {
+ size_t sourceIndex = it->first;
+ size_t targetIndex = it->second;
+ size_t nonTermIndex = nonTermIndexMap[targetIndex];
+
+ FEATUREVERBOSE(2, "Scoring nonTermIndex= " << nonTermIndex << " targetIndex= " << targetIndex << " sourceIndex= " << sourceIndex << std::endl);
+
+ // consult subderivation
+ const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
+ const TargetPhrase &prevTarPhr = prevHypo->GetCurrTargetPhrase();
+
+ if (const PhraseProperty *property = prevTarPhr.GetProperty("Orientation"))
+ {
+ const OrientationPhraseProperty *orientationPhraseProperty = static_cast<const OrientationPhraseProperty*>(property);
+
+ FEATUREVERBOSE(5, "orientationPhraseProperty: "
+ << "L2R_Mono " << orientationPhraseProperty->GetLeftToRightProbabilityMono()
+ << " L2R_Swap " << orientationPhraseProperty->GetLeftToRightProbabilitySwap()
+ << " L2R_Dright " << orientationPhraseProperty->GetLeftToRightProbabilityDright()
+ << " L2R_Dleft " << orientationPhraseProperty->GetLeftToRightProbabilityDleft()
+ << " R2L_Mono " << orientationPhraseProperty->GetRightToLeftProbabilityMono()
+ << " R2L_Swap " << orientationPhraseProperty->GetRightToLeftProbabilitySwap()
+ << " R2L_Dright " << orientationPhraseProperty->GetRightToLeftProbabilityDright()
+ << " R2L_Dleft " << orientationPhraseProperty->GetRightToLeftProbabilityDleft()
+ << std::endl);
+
+ const PhraseOrientationFeatureState* prevState =
+ static_cast<const PhraseOrientationFeatureState*>(prevHypo->GetFFState(featureID));
+
+
+ // LEFT-TO-RIGHT DIRECTION
+
+ Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::PhraseOrientation::REO_DIR_L2R);
+
+ IFFEATUREVERBOSE(2)
+ {
+ FEATUREVERBOSE(2, "l2rOrientation ");
+ switch (l2rOrientation)
+ {
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT:
+ FEATUREVERBOSE2(2, "mono" << std::endl);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
+ FEATUREVERBOSE2(2, "swap" << std::endl);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
+ FEATUREVERBOSE2(2, "dleft" << std::endl);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
+ FEATUREVERBOSE2(2, "dright" << std::endl);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
+ // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR
+ FEATUREVERBOSE2(2, "unknown->dleft" << std::endl);
+ break;
+ default:
+ UTIL_THROW2(GetScoreProducerDescription()
+ << ": Unsupported orientation type.");
+ break;
+ }
+ }
+
+ bool delayedScoringL2R = false;
+
+ if ( ((targetIndex == 0) || !phraseOrientation.TargetSpanIsAligned(0,targetIndex)) // boundary non-terminal in rule-initial position (left boundary)
+ && (currTarPhrLHS != m_glueTargetLHS) ) // and not glue rule
+ {
+ // delay left-to-right scoring
+
+ FEATUREVERBOSE(3, "Left boundary");
+ if (targetIndex != 0) {
+ FEATUREVERBOSE2(3, " (with targetIndex!=0)");
+ }
+ FEATUREVERBOSE2(3, std::endl);
+
+ bool previousSourceSpanIsAligned = ( (sourceIndex > 0) && phraseOrientation.SourceSpanIsAligned(0,sourceIndex-1) );
+ bool followingSourceSpanIsAligned = ( (sourceIndex < currSrcPhr->GetSize()-1) && phraseOrientation.SourceSpanIsAligned(sourceIndex,currSrcPhr->GetSize()-1) );
+
+ FEATUREVERBOSE(4, "previousSourceSpanIsAligned = " << previousSourceSpanIsAligned << std::endl);
+ FEATUREVERBOSE(4, "followingSourceSpanIsAligned = " << followingSourceSpanIsAligned << std::endl;);
+
+ if (previousSourceSpanIsAligned && followingSourceSpanIsAligned)
+ {
+ // discontinuous
+ l2rOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
+ }
+ else
+ {
+ FEATUREVERBOSE(3, "Delaying left-to-right scoring" << std::endl);
+
+ delayedScoringL2R = true;
+ std::bitset<3> possibleFutureOrientationsL2R(0x7);
+ possibleFutureOrientationsL2R[0] = !previousSourceSpanIsAligned;
+ possibleFutureOrientationsL2R[1] = !followingSourceSpanIsAligned;
+
+ // add heuristic scores
+
+ std::vector<float> weightsVector = StaticData::Instance().GetAllWeights().GetScoresForProducer(this);
+ std::vector<float> scoresL2R;
+ scoresL2R.push_back( std::log(orientationPhraseProperty->GetLeftToRightProbabilityMono()) );
+ scoresL2R.push_back( std::log(orientationPhraseProperty->GetLeftToRightProbabilitySwap()) );
+ scoresL2R.push_back( std::log(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous()) );
+ std::vector<float> weightedScoresL2R;
+ for ( size_t i=0; i<3;++i )
+ {
+ weightedScoresL2R.push_back( weightsVector[i] * scoresL2R[i] );
+ }
+
+ size_t heuristicScoreIndex = 0;
+ for (size_t i=1; i<3; ++i)
+ {
+ if (possibleFutureOrientationsL2R[i])
+ {
+ if (weightedScoresL2R[i] > weightedScoresL2R[heuristicScoreIndex])
+ {
+ heuristicScoreIndex = i;
+ }
+ }
+ }
+
+ IFFEATUREVERBOSE(5)
+ {
+ FEATUREVERBOSE(5, "Heuristic score computation (L2R): "
+ << "heuristicScoreIndex= " << heuristicScoreIndex);
+ for (size_t i=0; i<3; ++i)
+ FEATUREVERBOSE2(5, " weightsVector[" << i << "]= " << weightsVector[i]);
+ for (size_t i=0; i<3; ++i)
+ FEATUREVERBOSE2(5, " scoresL2R[" << i << "]= " << scoresL2R[i]);
+ for (size_t i=0; i<3; ++i)
+ FEATUREVERBOSE2(5, " weightedScoresL2R[" << i << "]= " << weightedScoresL2R[i]);
+ for (size_t i=0; i<3; ++i)
+ FEATUREVERBOSE2(5, " possibleFutureOrientationsL2R[" << i << "]= " << possibleFutureOrientationsL2R[i]);
+ if ( possibleFutureOrientationsL2R == 0x7 )
+ {
+ FEATUREVERBOSE2(5, " (all orientations possible)");
+ }
+ FEATUREVERBOSE2(5, std::endl);
+ }
+
+ newScores[heuristicScoreIndex] += scoresL2R[heuristicScoreIndex];
+ state->SetLeftBoundaryL2R(scoresL2R, heuristicScoreIndex, possibleFutureOrientationsL2R, nonTermIndex);
+
+ if ( (possibleFutureOrientationsL2R & prevState->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations) == 0x4 )
+ {
+ // recursive: discontinuous orientation
+ FEATUREVERBOSE(5, "previous state: L2R discontinuous orientation "
+ << possibleFutureOrientationsL2R << " & " << prevState->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations
+ << " = " << (possibleFutureOrientationsL2R & prevState->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations)
+ << std::endl);
+ LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores);
+ state->m_leftBoundaryRecursionGuard = true; // prevent subderivation from being scored recursively multiple times
+ }
+ }
+ }
+
+ if (!delayedScoringL2R)
+ {
+ switch (l2rOrientation)
+ {
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT:
+ newScores[0] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityMono());
+ // if sub-derivation has left-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x1, newScores);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
+ newScores[1] += std::log(orientationPhraseProperty->GetLeftToRightProbabilitySwap());
+ // if sub-derivation has left-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x2, newScores);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
+ newScores[2] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous());
+ // if sub-derivation has left-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
+ newScores[2] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous());
+ // if sub-derivation has left-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
+ // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR
+ newScores[2] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous());
+ // if sub-derivation has left-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores);
+ break;
+ default:
+ UTIL_THROW2(GetScoreProducerDescription()
+ << ": Unsupported orientation type.");
+ break;
+ }
+ }
+
+
+ // RIGHT-TO-LEFT DIRECTION
+
+ Moses::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::PhraseOrientation::REO_DIR_R2L);
+
+ IFFEATUREVERBOSE(2)
+ {
+ FEATUREVERBOSE(2, "r2lOrientation ");
+ switch (r2lOrientation)
+ {
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT:
+ FEATUREVERBOSE2(2, "mono" << std::endl);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
+ FEATUREVERBOSE2(2, "swap" << std::endl);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
+ FEATUREVERBOSE2(2, "dleft" << std::endl);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
+ FEATUREVERBOSE2(2, "dright" << std::endl);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
+ // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR
+ FEATUREVERBOSE2(2, "unknown->dleft" << std::endl);
+ break;
+ default:
+ UTIL_THROW2(GetScoreProducerDescription()
+ << ": Unsupported orientation type.");
+ break;
+ }
+ }
+
+ bool delayedScoringR2L = false;
+
+ if ( ((targetIndex == currTarPhr.GetSize()-1) || !phraseOrientation.TargetSpanIsAligned(targetIndex,currTarPhr.GetSize()-1)) // boundary non-terminal in rule-final position (right boundary)
+ && (currTarPhrLHS != m_glueTargetLHS) ) // and not glue rule
+ {
+ // delay right-to-left scoring
+
+ FEATUREVERBOSE(3, "Right boundary");
+ if (targetIndex != currTarPhr.GetSize()-1) {
+ FEATUREVERBOSE2(3, " (with targetIndex!=currTarPhr.GetSize()-1)");
+ }
+ FEATUREVERBOSE2(3, std::endl);
+
+ bool previousSourceSpanIsAligned = ( (sourceIndex > 0) && phraseOrientation.SourceSpanIsAligned(0,sourceIndex-1) );
+ bool followingSourceSpanIsAligned = ( (sourceIndex < currSrcPhr->GetSize()-1) && phraseOrientation.SourceSpanIsAligned(sourceIndex,currSrcPhr->GetSize()-1) );
+
+ FEATUREVERBOSE(4, "previousSourceSpanIsAligned = " << previousSourceSpanIsAligned << std::endl);
+ FEATUREVERBOSE(4, "followingSourceSpanIsAligned = " << followingSourceSpanIsAligned << std::endl;);
+
+ if (previousSourceSpanIsAligned && followingSourceSpanIsAligned)
+ {
+ // discontinuous
+ r2lOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
+ }
+ else
+ {
+ FEATUREVERBOSE(3, "Delaying right-to-left scoring" << std::endl);
+
+ delayedScoringR2L = true;
+ std::bitset<3> possibleFutureOrientationsR2L(0x7);
+ possibleFutureOrientationsR2L[0] = !followingSourceSpanIsAligned;
+ possibleFutureOrientationsR2L[1] = !previousSourceSpanIsAligned;
+
+ // add heuristic scores
+
+ std::vector<float> weightsVector = StaticData::Instance().GetAllWeights().GetScoresForProducer(this);
+ std::vector<float> scoresR2L;
+ scoresR2L.push_back( std::log(orientationPhraseProperty->GetRightToLeftProbabilityMono()) );
+ scoresR2L.push_back( std::log(orientationPhraseProperty->GetRightToLeftProbabilitySwap()) );
+ scoresR2L.push_back( std::log(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous()) );
+ std::vector<float> weightedScoresR2L;
+ for ( size_t i=0; i<3;++i )
+ {
+ weightedScoresR2L.push_back( weightsVector[m_offsetR2LScores+i] * scoresR2L[i] );
+ }
+
+ size_t heuristicScoreIndex = 0;
+ for (size_t i=1; i<3; ++i)
+ {
+ if (possibleFutureOrientationsR2L[i])
+ {
+ if (weightedScoresR2L[i] > weightedScoresR2L[heuristicScoreIndex])
+ {
+ heuristicScoreIndex = i;
+ }
+ }
+ }
+
+ IFFEATUREVERBOSE(5)
+ {
+ FEATUREVERBOSE(5, "Heuristic score computation (R2L): "
+ << "heuristicScoreIndex= " << heuristicScoreIndex);
+ for (size_t i=0; i<3; ++i)
+ FEATUREVERBOSE2(5, " weightsVector[" << m_offsetR2LScores+i << "]= " << weightsVector[m_offsetR2LScores+i]);
+ for (size_t i=0; i<3; ++i)
+ FEATUREVERBOSE2(5, " scoresR2L[" << i << "]= " << scoresR2L[i]);
+ for (size_t i=0; i<3; ++i)
+ FEATUREVERBOSE2(5, " weightedScoresR2L[" << i << "]= " << weightedScoresR2L[i]);
+ for (size_t i=0; i<3; ++i)
+ FEATUREVERBOSE2(5, " possibleFutureOrientationsR2L[" << i << "]= " << possibleFutureOrientationsR2L[i]);
+ if ( possibleFutureOrientationsR2L == 0x7 )
+ {
+ FEATUREVERBOSE2(5, " (all orientations possible)");
+ }
+ FEATUREVERBOSE2(5, std::endl);
+ }
+
+ newScores[m_offsetR2LScores+heuristicScoreIndex] += scoresR2L[heuristicScoreIndex];
+ state->SetRightBoundaryR2L(scoresR2L, heuristicScoreIndex, possibleFutureOrientationsR2L, nonTermIndex);
+
+ if ( (possibleFutureOrientationsR2L & prevState->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations) == 0x4 )
+ {
+ // recursive: discontinuous orientation
+ FEATUREVERBOSE(5, "previous state: R2L discontinuous orientation "
+ << possibleFutureOrientationsR2L << " & " << prevState->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations
+ << " = " << (possibleFutureOrientationsR2L & prevState->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations)
+ << std::endl);
+ RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores);
+ state->m_rightBoundaryRecursionGuard = true; // prevent subderivation from being scored recursively multiple times
+ }
+ }
+ }
+
+ if (!delayedScoringR2L)
+ {
+ switch (r2lOrientation)
+ {
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT:
+ newScores[m_offsetR2LScores+0] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityMono());
+ // if sub-derivation has right-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x1, newScores);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
+ newScores[m_offsetR2LScores+1] += std::log(orientationPhraseProperty->GetRightToLeftProbabilitySwap());
+ // if sub-derivation has right-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x2, newScores);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
+ newScores[m_offsetR2LScores+2] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous());
+ // if sub-derivation has right-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
+ newScores[m_offsetR2LScores+2] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous());
+ // if sub-derivation has right-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores);
+ break;
+ case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
+ // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR
+ newScores[m_offsetR2LScores+2] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous());
+ // if sub-derivation has right-boundary non-terminal:
+ // add recursive actual score of boundary non-terminal from subderivation
+ RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores);
+ break;
+ default:
+ UTIL_THROW2(GetScoreProducerDescription()
+ << ": Unsupported orientation type.");
+ break;
+ }
+ }
+ }
+ else
+ {
+ // abort with error message if the phrase does not translate an unknown word
+ UTIL_THROW_IF2(!prevTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
+ << ": Missing Orientation property. "
+ << "Please check phrase table and glue rules.");
+ }
+ }
+
+ accumulator->PlusEquals(this, newScores);
+
+ return state;
+}
+
+void PhraseOrientationFeature::LeftBoundaryL2RScoreRecursive(int featureID,
+ const ChartHypothesis *hypo,
+ const PhraseOrientationFeatureState *state,
+ const std::bitset<3> orientation,
+ std::vector<float>& newScores) const
+{
+ if (state->m_leftBoundaryIsSet)
+ {
+ // subtract heuristic score from subderivation
+ newScores[state->m_leftBoundaryNonTerminalL2RHeuristicScoreIndex] -= state->m_leftBoundaryNonTerminalL2RScores[state->m_leftBoundaryNonTerminalL2RHeuristicScoreIndex];
+
+ // add actual score
+ std::bitset<3> recursiveOrientation = orientation;
+ if ( (orientation == 0x4) || (orientation == 0x0) )
+ {
+ // discontinuous
+ newScores[2] += state->GetLeftBoundaryL2RScoreDiscontinuous();
+ }
+ else
+ {
+ recursiveOrientation &= state->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations;
+ if ( recursiveOrientation == 0x1 )
+ {
+ // monotone
+ newScores[0] += state->GetLeftBoundaryL2RScoreMono();
+ }
+ else if ( recursiveOrientation == 0x2 )
+ {
+ // swap
+ newScores[1] += state->GetLeftBoundaryL2RScoreSwap();
+ }
+ else if ( recursiveOrientation == 0x4 )
+ {
+ // discontinuous
+ newScores[2] += state->GetLeftBoundaryL2RScoreDiscontinuous();
+ }
+ else if ( recursiveOrientation == 0x0 )
+ {
+ // discontinuous
+ newScores[2] += state->GetLeftBoundaryL2RScoreDiscontinuous();
+ }
+ else
+ {
+ UTIL_THROW2(GetScoreProducerDescription()
+ << ": Error in recursive scoring.");
+ }
+ }
+
+ FEATUREVERBOSE(6, "Left boundary recursion: " << orientation << " & " << state->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations << " = " << recursiveOrientation
+ << " --- Subtracted heuristic score: " << state->m_leftBoundaryNonTerminalL2RScores[state->m_leftBoundaryNonTerminalL2RHeuristicScoreIndex] << std::endl);
+
+ if (!state->m_leftBoundaryRecursionGuard)
+ {
+ // recursive call
+ const ChartHypothesis *prevHypo = hypo->GetPrevHypo(state->m_leftBoundaryNonTerminalIndex);
+ const PhraseOrientationFeatureState* prevState =
+ static_cast<const PhraseOrientationFeatureState*>(prevHypo->GetFFState(featureID));
+
+ LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, recursiveOrientation, newScores);
+ }
+ else
+ {
+ FEATUREVERBOSE(6, "m_leftBoundaryRecursionGuard" << std::endl);
+ }
+ }
+}
+
+void PhraseOrientationFeature::RightBoundaryR2LScoreRecursive(int featureID,
+ const ChartHypothesis *hypo,
+ const PhraseOrientationFeatureState *state,
+ const std::bitset<3> orientation,
+ std::vector<float>& newScores) const
+{
+ if (state->m_rightBoundaryIsSet)
+ {
+ // subtract heuristic score from subderivation
+ newScores[m_offsetR2LScores+state->m_rightBoundaryNonTerminalR2LHeuristicScoreIndex] -= state->m_rightBoundaryNonTerminalR2LScores[state->m_rightBoundaryNonTerminalR2LHeuristicScoreIndex];
+
+ // add actual score
+ std::bitset<3> recursiveOrientation = orientation;
+ if ( (orientation == 0x4) || (orientation == 0x0) )
+ {
+ // discontinuous
+ newScores[m_offsetR2LScores+2] += state->GetRightBoundaryR2LScoreDiscontinuous();
+ }
+ else
+ {
+ recursiveOrientation &= state->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations;
+ if ( recursiveOrientation == 0x1 )
+ {
+ // monotone
+ newScores[m_offsetR2LScores+0] += state->GetRightBoundaryR2LScoreMono();
+ }
+ else if ( recursiveOrientation == 0x2 )
+ {
+ // swap
+ newScores[m_offsetR2LScores+1] += state->GetRightBoundaryR2LScoreSwap();
+ }
+ else if ( recursiveOrientation == 0x4 )
+ {
+ // discontinuous
+ newScores[m_offsetR2LScores+2] += state->GetRightBoundaryR2LScoreDiscontinuous();
+ }
+ else if ( recursiveOrientation == 0x0 )
+ {
+ // discontinuous
+ newScores[m_offsetR2LScores+2] += state->GetRightBoundaryR2LScoreDiscontinuous();
+ }
+ else
+ {
+ UTIL_THROW2(GetScoreProducerDescription()
+ << ": Error in recursive scoring.");
+ }
+ }
+
+ FEATUREVERBOSE(6, "Right boundary recursion: " << orientation << " & " << state->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations << " = " << recursiveOrientation
+ << " --- Subtracted heuristic score: " << state->m_rightBoundaryNonTerminalR2LScores[state->m_rightBoundaryNonTerminalR2LHeuristicScoreIndex] << std::endl);
+
+ if (!state->m_rightBoundaryRecursionGuard)
+ {
+ // recursive call
+ const ChartHypothesis *prevHypo = hypo->GetPrevHypo(state->m_rightBoundaryNonTerminalIndex);
+ const PhraseOrientationFeatureState* prevState =
+ static_cast<const PhraseOrientationFeatureState*>(prevHypo->GetFFState(featureID));
+
+ RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, recursiveOrientation, newScores);
+ }
+ else
+ {
+ FEATUREVERBOSE(6, "m_rightBoundaryRecursionGuard" << std::endl);
+ }
+ }
+}
+
+
+}
+
diff --git a/moses/FF/PhraseOrientationFeature.h b/moses/FF/PhraseOrientationFeature.h
new file mode 100644
index 000000000..8b363056e
--- /dev/null
+++ b/moses/FF/PhraseOrientationFeature.h
@@ -0,0 +1,211 @@
+//
+// REFERENCE
+// ---------
+// When using this feature, please cite:
+//
+// Matthias Huck, Joern Wuebker, Felix Rietig, and Hermann Ney.
+// A Phrase Orientation Model for Hierarchical Machine Translation.
+// In ACL 2013 Eighth Workshop on Statistical Machine Translation (WMT 2013), pages 452-463, Sofia, Bulgaria, August 2013.
+//
+
+#pragma once
+
+#include <bitset>
+#include <string>
+#include <vector>
+#include "StatefulFeatureFunction.h"
+#include "FFState.h"
+#include "moses/Factor.h"
+#include "phrase-extract/extract-ghkm/PhraseOrientation.h"
+
+
+namespace Moses
+{
+
+class PhraseOrientationFeatureState : public FFState
+{
+public:
+
+ friend class PhraseOrientationFeature;
+
+ PhraseOrientationFeatureState()
+ : m_leftBoundaryNonTerminalL2RScores(3,0)
+ , m_rightBoundaryNonTerminalR2LScores(3,0)
+ , m_leftBoundaryNonTerminalL2RPossibleFutureOrientations(0x7)
+ , m_rightBoundaryNonTerminalR2LPossibleFutureOrientations(0x7)
+ , m_leftBoundaryRecursionGuard(false)
+ , m_rightBoundaryRecursionGuard(false)
+ , m_leftBoundaryIsSet(false)
+ , m_rightBoundaryIsSet(false)
+ {}
+
+ void SetLeftBoundaryL2R(const std::vector<float> &scores,
+ size_t heuristicScoreIndex,
+ std::bitset<3> &possibleFutureOrientations,
+ size_t nonTerminalIndex)
+ {
+ for (size_t i=0; i<3; ++i)
+ {
+ m_leftBoundaryNonTerminalL2RScores[i] = scores[i];
+ m_leftBoundaryNonTerminalL2RPossibleFutureOrientations[i] = possibleFutureOrientations[i];
+ }
+ m_leftBoundaryNonTerminalL2RHeuristicScoreIndex = heuristicScoreIndex;
+ m_leftBoundaryNonTerminalIndex = nonTerminalIndex;
+ m_leftBoundaryIsSet = true;
+ }
+
+ void SetRightBoundaryR2L(const std::vector<float> &scores,
+ size_t heuristicScoreIndex,
+ std::bitset<3> &possibleFutureOrientations,
+ size_t nonTerminalIndex)
+ {
+ for (size_t i=0; i<3; ++i)
+ {
+ m_rightBoundaryNonTerminalR2LScores[i] = scores[i];
+ m_rightBoundaryNonTerminalR2LPossibleFutureOrientations[i] = possibleFutureOrientations[i];
+ }
+ m_rightBoundaryNonTerminalR2LHeuristicScoreIndex = heuristicScoreIndex;
+ m_rightBoundaryNonTerminalIndex = nonTerminalIndex;
+ m_rightBoundaryIsSet = true;
+ }
+
+
+ float GetLeftBoundaryL2RScoreMono() const
+ {
+ return m_leftBoundaryNonTerminalL2RScores[0];
+ }
+
+ float GetLeftBoundaryL2RScoreSwap() const
+ {
+ return m_leftBoundaryNonTerminalL2RScores[1];
+ }
+
+ float GetLeftBoundaryL2RScoreDiscontinuous() const
+ {
+ return m_leftBoundaryNonTerminalL2RScores[2];
+ }
+
+
+ float GetRightBoundaryR2LScoreMono() const
+ {
+ return m_rightBoundaryNonTerminalR2LScores[0];
+ }
+
+ float GetRightBoundaryR2LScoreSwap() const
+ {
+ return m_rightBoundaryNonTerminalR2LScores[1];
+ }
+
+ float GetRightBoundaryR2LScoreDiscontinuous() const
+ {
+ return m_rightBoundaryNonTerminalR2LScores[2];
+ }
+
+
+ int Compare(const FFState& other) const
+ {
+ return 0;
+ };
+
+private:
+
+ template<std::size_t N> bool Smaller(const std::bitset<N>& x, const std::bitset<N>& y) const
+ {
+ for (size_t i=0; i<N; ++i)
+ {
+ if (x[i] ^ y[i])
+ return y[i];
+ }
+ return false;
+ }
+
+ std::vector<float> m_leftBoundaryNonTerminalL2RScores;
+ std::vector<float> m_rightBoundaryNonTerminalR2LScores;
+
+ size_t m_leftBoundaryNonTerminalL2RHeuristicScoreIndex;
+ size_t m_rightBoundaryNonTerminalR2LHeuristicScoreIndex;
+
+ std::bitset<3> m_leftBoundaryNonTerminalL2RPossibleFutureOrientations;
+ std::bitset<3> m_rightBoundaryNonTerminalR2LPossibleFutureOrientations;
+
+ size_t m_leftBoundaryNonTerminalIndex;
+ size_t m_rightBoundaryNonTerminalIndex;
+ bool m_leftBoundaryRecursionGuard;
+ bool m_rightBoundaryRecursionGuard;
+ bool m_leftBoundaryIsSet;
+ bool m_rightBoundaryIsSet;
+};
+
+
+
+class PhraseOrientationFeature : public StatefulFeatureFunction
+{
+public:
+
+ PhraseOrientationFeature(const std::string &line);
+
+ ~PhraseOrientationFeature() {
+ }
+
+ bool IsUseable(const FactorMask &mask) const {
+ return true;
+ }
+
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const {
+ return new PhraseOrientationFeatureState();
+ }
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+ {
+ targetPhrase.SetRuleSource(source);
+ };
+
+ void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const
+ {};
+
+ FFState* EvaluateWhenApplied(
+ const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const
+ {
+ return new PhraseOrientationFeatureState();
+ };
+
+ FFState* EvaluateWhenApplied(
+ const ChartHypothesis& cur_hypo,
+ int featureID, // used to index the state in the previous hypotheses
+ ScoreComponentCollection* accumulator) const;
+
+protected:
+
+ void LeftBoundaryL2RScoreRecursive(int featureID,
+ const ChartHypothesis *hypo,
+ const PhraseOrientationFeatureState *state,
+ const std::bitset<3> orientation,
+ std::vector<float>& newScores) const;
+
+ void RightBoundaryR2LScoreRecursive(int featureID,
+ const ChartHypothesis *hypo,
+ const PhraseOrientationFeatureState *state,
+ const std::bitset<3> orientation,
+ std::vector<float>& newScores) const;
+
+ std::string m_glueTargetLHSStr;
+ Word m_glueTargetLHS;
+ size_t m_offsetR2LScores;
+
+};
+
+
+}
+
diff --git a/moses/FF/PhrasePairFeature.cpp b/moses/FF/PhrasePairFeature.cpp
index 9277e19f2..6daab7e25 100644
--- a/moses/FF/PhrasePairFeature.cpp
+++ b/moses/FF/PhrasePairFeature.cpp
@@ -106,12 +106,14 @@ void PhrasePairFeature::Load()
}
}
-void PhrasePairFeature::Evaluate(
- const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
{
- const TargetPhrase& target = hypo.GetCurrTargetPhrase();
- const Phrase& source = hypo.GetTranslationOption().GetInputPath().GetPhrase();
+ const Phrase& source = inputPath.GetPhrase();
if (m_simple) {
ostringstream namestr;
namestr << "pp_";
@@ -122,17 +124,17 @@ void PhrasePairFeature::Evaluate(
namestr << sourceFactor->GetString();
}
namestr << "~";
- namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
- for (size_t i = 1; i < target.GetSize(); ++i) {
- const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
+ namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
+ for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
+ const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
- accumulator->SparsePlusEquals(namestr.str(),1);
+ scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
- const Sentence& input = static_cast<const Sentence&>(hypo.GetInput());
+ const Sentence& input = static_cast<const Sentence&>(input);
const bool use_topicid = input.GetUseTopicId();
const bool use_topicid_prob = input.GetUseTopicIdAndProb();
@@ -145,9 +147,9 @@ void PhrasePairFeature::Evaluate(
pair << sourceFactor->GetString();
}
pair << "~";
- pair << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
- for (size_t i = 1; i < target.GetSize(); ++i) {
- const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
+ pair << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
+ for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
+ const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
pair << ",";
pair << targetFactor->GetString();
}
@@ -165,7 +167,7 @@ void PhrasePairFeature::Evaluate(
feature << "_";
feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
// use topic probabilities
const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
@@ -173,7 +175,7 @@ void PhrasePairFeature::Evaluate(
stringstream feature;
feature << "pp_unk_";
feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
@@ -181,7 +183,7 @@ void PhrasePairFeature::Evaluate(
feature << topicid_prob[i];
feature << "_";
feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
+ scoreBreakdown.SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
}
}
}
@@ -195,12 +197,12 @@ void PhrasePairFeature::Evaluate(
namestr << sourceTrigger;
namestr << "_";
namestr << pair.str();
- accumulator->SparsePlusEquals(namestr.str(),1);
+ scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
}
}
if (m_sourceContext) {
- const Sentence& input = static_cast<const Sentence&>(hypo.GetInput());
+ const Sentence& input = static_cast<const Sentence&>(input);
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
@@ -229,14 +231,14 @@ void PhrasePairFeature::Evaluate(
namestr << sourceFactor->GetString();
}
namestr << "~";
- namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
- for (size_t i = 1; i < target.GetSize(); ++i) {
- const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
+ namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
+ for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
+ const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
- accumulator->SparsePlusEquals(namestr.str(),1);
+ scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
}
}
diff --git a/moses/FF/PhrasePairFeature.h b/moses/FF/PhrasePairFeature.h
index 7790e9035..b0f380d0a 100644
--- a/moses/FF/PhrasePairFeature.h
+++ b/moses/FF/PhrasePairFeature.h
@@ -35,31 +35,32 @@ class PhrasePairFeature: public StatelessFeatureFunction
public:
PhrasePairFeature(const std::string &line);
- bool IsUseable(const FactorMask &mask) const;
+ void Load();
+ void SetParameter(const std::string& key, const std::string& value);
- void Evaluate(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const;
+ bool IsUseable(const FactorMask &mask) const;
- void EvaluateChart(const ChartHypothesis& hypo,
- ScoreComponentCollection*) const {
- throw std::logic_error("PhrasePairFeature not valid in chart decoder");
- }
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+ {}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
{}
- void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+
+ void EvaluateWhenApplied(const ChartHypothesis& hypo,
+ ScoreComponentCollection*) const
{}
- void Load();
- void SetParameter(const std::string& key, const std::string& value);
};
diff --git a/moses/FF/PhrasePenalty.cpp b/moses/FF/PhrasePenalty.cpp
index 259c94d4c..cd1b735df 100644
--- a/moses/FF/PhrasePenalty.cpp
+++ b/moses/FF/PhrasePenalty.cpp
@@ -1,24 +1,53 @@
+#include <vector>
#include "PhrasePenalty.h"
-#include "moses/TargetPhrase.h"
#include "moses/ScoreComponentCollection.h"
+#include "moses/TranslationModel/PhraseDictionary.h"
+#include "util/exception.hh"
using namespace std;
namespace Moses
{
PhrasePenalty::PhrasePenalty(const std::string &line)
- : StatelessFeatureFunction(1, line)
+: StatelessFeatureFunction(1, line)
+, m_perPhraseTable(false)
{
ReadParameters();
}
-void PhrasePenalty::Evaluate(const Phrase &source
+void PhrasePenalty::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{
- scoreBreakdown.Assign(this, - 1.0f);
+ if (m_perPhraseTable) {
+ const PhraseDictionary *pt = targetPhrase.GetContainer();
+ if (pt) {
+ size_t ptId = pt->GetId();
+ UTIL_THROW_IF2(ptId >= m_numScoreComponents, "Wrong number of scores");
+
+ vector<float> scores(m_numScoreComponents, 0);
+ scores[ptId] = 1.0f;
+
+ scoreBreakdown.Assign(this, scores);
+ }
+
+ }
+ else {
+ scoreBreakdown.Assign(this, 1.0f);
+ }
}
+void PhrasePenalty::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "per-phrase-table") {
+ m_perPhraseTable =Scan<bool>(value);
+ }
+ else {
+ StatelessFeatureFunction::SetParameter(key, value);
+ }
}
+
+} // namespace
+
diff --git a/moses/FF/PhrasePenalty.h b/moses/FF/PhrasePenalty.h
index b7fbdf6b3..80635b4e1 100644
--- a/moses/FF/PhrasePenalty.h
+++ b/moses/FF/PhrasePenalty.h
@@ -14,20 +14,22 @@ public:
return true;
}
- virtual void Evaluate(const Phrase &source
+ virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
-
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
ScoreComponentCollection* accumulator) const
{}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
@@ -35,6 +37,10 @@ public:
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
+ void SetParameter(const std::string& key, const std::string& value);
+
+protected:
+ bool m_perPhraseTable;
};
} //namespace
diff --git a/moses/FF/ReferenceComparison.h b/moses/FF/ReferenceComparison.h
index 7523bfcca..62cf15ced 100644
--- a/moses/FF/ReferenceComparison.h
+++ b/moses/FF/ReferenceComparison.h
@@ -10,37 +10,35 @@ namespace Moses
class ReferenceComparison : public StatelessFeatureFunction
{
public:
- ReferenceComparison(const std::string &line);
-
- virtual bool IsUseable(const FactorMask &mask) const {
- return true;
- }
-
- virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
- {}
-
- virtual void Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const
- {}
-
- virtual void Evaluate(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
- {}
-
- virtual void EvaluateChart(const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const
- {}
-
- std::vector<float> DefaultWeights() const {
- return std::vector<float>();
- }
+ ReferenceComparison(const std::string &line);
+
+ virtual bool IsUseable(const FactorMask &mask) const
+ { return true; }
+
+ virtual void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+ {}
+
+ virtual void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const
+ {}
+
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ std::vector<float> DefaultWeights() const
+ { return std::vector<float>(); }
protected:
diff --git a/moses/FF/RuleScope.cpp b/moses/FF/RuleScope.cpp
index 6c45d183a..ed329c4ca 100644
--- a/moses/FF/RuleScope.cpp
+++ b/moses/FF/RuleScope.cpp
@@ -5,8 +5,8 @@
namespace Moses
{
RuleScope::RuleScope(const std::string &line)
- :StatelessFeatureFunction(1, line)
- ,m_sourceSyntax(true)
+:StatelessFeatureFunction(1, line)
+,m_sourceSyntax(true)
{
}
@@ -16,10 +16,10 @@ bool IsAmbiguous(const Word &word, bool sourceSyntax)
return word.IsNonTerminal() && (!sourceSyntax || word == inputDefaultNonTerminal);
}
-void RuleScope::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+void RuleScope::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
// adjacent non-term count as 1 ammbiguity, rather than 2 as in rule scope
// source can't be empty, right?
@@ -27,22 +27,23 @@ void RuleScope::Evaluate(const Phrase &source
int count = 0;
for (size_t i = 0; i < source.GetSize() - 0; ++i) {
- const Word &word = source.GetWord(i);
- bool ambiguous = IsAmbiguous(word, m_sourceSyntax);
- if (ambiguous) {
- ++count;
- } else {
- if (count > 0) {
- score += count;
- }
- count = -1;
- }
+ const Word &word = source.GetWord(i);
+ bool ambiguous = IsAmbiguous(word, m_sourceSyntax);
+ if (ambiguous) {
+ ++count;
+ }
+ else {
+ if (count > 0) {
+ score += count;
+ }
+ count = -1;
+ }
}
// 1st & last always adjacent to ambiguity
++count;
if (count > 0) {
- score += count;
+ score += count;
}
scoreBreakdown.PlusEquals(this, score);
@@ -51,7 +52,7 @@ void RuleScope::Evaluate(const Phrase &source
void RuleScope::SetParameter(const std::string& key, const std::string& value)
{
if (key == "source-syntax") {
- m_sourceSyntax = Scan<bool>(value);
+ m_sourceSyntax = Scan<bool>(value);
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
diff --git a/moses/FF/RuleScope.h b/moses/FF/RuleScope.h
index 2e08f2c11..a2c9e06f3 100644
--- a/moses/FF/RuleScope.h
+++ b/moses/FF/RuleScope.h
@@ -9,34 +9,33 @@ namespace Moses
class RuleScope : public StatelessFeatureFunction
{
public:
- RuleScope(const std::string &line);
-
- virtual bool IsUseable(const FactorMask &mask) const {
- return true;
- }
-
- virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
-
- virtual void Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const
- {}
-
- virtual void Evaluate(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
- {}
-
- virtual void EvaluateChart(const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const
- {}
-
- void SetParameter(const std::string& key, const std::string& value);
+ RuleScope(const std::string &line);
+
+ virtual bool IsUseable(const FactorMask &mask) const
+ { return true; }
+
+ virtual void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
+
+ virtual void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const
+ {}
+
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ void SetParameter(const std::string& key, const std::string& value);
protected:
bool m_sourceSyntax;
diff --git a/moses/FF/SetSourcePhrase.cpp b/moses/FF/SetSourcePhrase.cpp
index 9034835c0..f89683f28 100644
--- a/moses/FF/SetSourcePhrase.cpp
+++ b/moses/FF/SetSourcePhrase.cpp
@@ -4,18 +4,18 @@
namespace Moses
{
SetSourcePhrase::SetSourcePhrase(const std::string &line)
- :StatelessFeatureFunction(0, line)
+:StatelessFeatureFunction(0, line)
{
m_tuneable = false;
ReadParameters();
}
-void SetSourcePhrase::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+void SetSourcePhrase::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
- targetPhrase.SetRuleSource(source);
+ targetPhrase.SetRuleSource(source);
}
}
diff --git a/moses/FF/SetSourcePhrase.h b/moses/FF/SetSourcePhrase.h
index e11695a22..81f293dde 100644
--- a/moses/FF/SetSourcePhrase.h
+++ b/moses/FF/SetSourcePhrase.h
@@ -11,34 +11,32 @@ class SetSourcePhrase : public StatelessFeatureFunction
public:
SetSourcePhrase(const std::string &line);
- virtual bool IsUseable(const FactorMask &mask) const {
- return true;
- }
-
- virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
-
- virtual void Evaluate(const InputType &input
- , const InputPath &inputPath
- , const TargetPhrase &targetPhrase
- , const StackVec *stackVec
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const
+ virtual bool IsUseable(const FactorMask &mask) const
+ { return true; }
+
+ virtual void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
+
+ virtual void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- virtual void Evaluate(const Hypothesis& hypo,
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- virtual void EvaluateChart(const ChartHypothesis &hypo,
+ virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
- std::vector<float> DefaultWeights() const {
- return std::vector<float>();
- }
+ std::vector<float> DefaultWeights() const
+ { return std::vector<float>(); }
};
diff --git a/moses/FF/SkeletonChangeInput.cpp b/moses/FF/SkeletonChangeInput.cpp
new file mode 100644
index 000000000..74a85ba5e
--- /dev/null
+++ b/moses/FF/SkeletonChangeInput.cpp
@@ -0,0 +1,92 @@
+#include <vector>
+#include "SkeletonChangeInput.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TargetPhrase.h"
+#include "moses/Sentence.h"
+#include "moses/FactorCollection.h"
+#include "util/exception.hh"
+
+using namespace std;
+
+namespace Moses
+{
+SkeletonChangeInput::SkeletonChangeInput(const std::string &line)
+ :StatelessFeatureFunction(2, line)
+{
+ ReadParameters();
+}
+
+void SkeletonChangeInput::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+{
+ // dense scores
+ vector<float> newScores(m_numScoreComponents);
+ newScores[0] = 1.5;
+ newScores[1] = 0.3;
+ scoreBreakdown.PlusEquals(this, newScores);
+
+ // sparse scores
+ scoreBreakdown.PlusEquals(this, "sparse-name", 2.4);
+
+}
+
+void SkeletonChangeInput::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+ if (targetPhrase.GetNumNonTerminals()) {
+ vector<float> newScores(m_numScoreComponents);
+ newScores[0] = - std::numeric_limits<float>::infinity();
+ scoreBreakdown.PlusEquals(this, newScores);
+ }
+
+}
+
+void SkeletonChangeInput::EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+{}
+
+void SkeletonChangeInput::EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+{}
+
+void SkeletonChangeInput::ChangeSource(InputType *&input) const
+{
+ // add factor[1] to each word. Created from first 4 letter of factor[0]
+
+ Sentence *sentence = dynamic_cast<Sentence*>(input);
+ UTIL_THROW_IF2(sentence == NULL, "Not a sentence input");
+
+ FactorCollection &fc = FactorCollection::Instance();
+
+ size_t size = sentence->GetSize();
+ for (size_t i = 0; i < size; ++i) {
+ Word &word = sentence->Phrase::GetWord(i);
+ const Factor *factor0 = word[0];
+
+ std::string str = factor0->GetString().as_string();
+ if (str.length() > 4) {
+ str = str.substr(0, 4);
+ }
+
+ const Factor *factor1 = fc.AddFactor(str);
+ word.SetFactor(1, factor1);
+ }
+}
+
+void SkeletonChangeInput::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "arg") {
+ // set value here
+ } else {
+ StatelessFeatureFunction::SetParameter(key, value);
+ }
+}
+
+}
+
diff --git a/moses/FF/SkeletonChangeInput.h b/moses/FF/SkeletonChangeInput.h
new file mode 100644
index 000000000..07b19e768
--- /dev/null
+++ b/moses/FF/SkeletonChangeInput.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <string>
+#include "StatelessFeatureFunction.h"
+
+namespace Moses
+{
+
+class SkeletonChangeInput : public StatelessFeatureFunction
+{
+public:
+ SkeletonChangeInput(const std::string &line);
+
+ bool IsUseable(const FactorMask &mask) const {
+ return true;
+ }
+
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
+
+ void ChangeSource(InputType *&input) const;
+
+ void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const;
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const;
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+};
+
+}
+
diff --git a/moses/FF/SkeletonStatefulFF.cpp b/moses/FF/SkeletonStatefulFF.cpp
index 44771e646..fe81aeeae 100644
--- a/moses/FF/SkeletonStatefulFF.cpp
+++ b/moses/FF/SkeletonStatefulFF.cpp
@@ -16,13 +16,20 @@ int SkeletonState::Compare(const FFState& other) const
return (m_targetLen < otherState.m_targetLen) ? -1 : +1;
}
-void SkeletonStatefulFF::Evaluate(const Phrase &source
+////////////////////////////////////////////////////////////////
+SkeletonStatefulFF::SkeletonStatefulFF(const std::string &line)
+ :StatefulFeatureFunction(3, line)
+{
+ ReadParameters();
+}
+
+void SkeletonStatefulFF::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
-void SkeletonStatefulFF::Evaluate(const InputType &input
+void SkeletonStatefulFF::EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
@@ -30,7 +37,7 @@ void SkeletonStatefulFF::Evaluate(const InputType &input
, ScoreComponentCollection *estimatedFutureScore) const
{}
-FFState* SkeletonStatefulFF::Evaluate(
+FFState* SkeletonStatefulFF::EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
@@ -49,7 +56,7 @@ FFState* SkeletonStatefulFF::Evaluate(
return new SkeletonState(0);
}
-FFState* SkeletonStatefulFF::EvaluateChart(
+FFState* SkeletonStatefulFF::EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const
@@ -57,6 +64,14 @@ FFState* SkeletonStatefulFF::EvaluateChart(
return new SkeletonState(0);
}
+void SkeletonStatefulFF::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "arg") {
+ // set value here
+ } else {
+ StatefulFeatureFunction::SetParameter(key, value);
+ }
+}
}
diff --git a/moses/FF/SkeletonStatefulFF.h b/moses/FF/SkeletonStatefulFF.h
index 1f2baa92b..6fa26803e 100644
--- a/moses/FF/SkeletonStatefulFF.h
+++ b/moses/FF/SkeletonStatefulFF.h
@@ -21,9 +21,7 @@ public:
class SkeletonStatefulFF : public StatefulFeatureFunction
{
public:
- SkeletonStatefulFF(const std::string &line)
- :StatefulFeatureFunction(3, line)
- {}
+ SkeletonStatefulFF(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;
@@ -32,25 +30,27 @@ public:
return new SkeletonState(0);
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
- FFState* Evaluate(
+ FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(
+ FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
+ void SetParameter(const std::string& key, const std::string& value);
+
};
diff --git a/moses/FF/SkeletonStatelessFF.cpp b/moses/FF/SkeletonStatelessFF.cpp
index 0ef8570ee..80c7d130e 100644
--- a/moses/FF/SkeletonStatelessFF.cpp
+++ b/moses/FF/SkeletonStatelessFF.cpp
@@ -1,12 +1,19 @@
+#include <vector>
#include "SkeletonStatelessFF.h"
#include "moses/ScoreComponentCollection.h"
-#include <vector>
+#include "moses/TargetPhrase.h"
using namespace std;
namespace Moses
{
-void SkeletonStatelessFF::Evaluate(const Phrase &source
+SkeletonStatelessFF::SkeletonStatelessFF(const std::string &line)
+ :StatelessFeatureFunction(2, line)
+{
+ ReadParameters();
+}
+
+void SkeletonStatelessFF::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
@@ -22,21 +29,37 @@ void SkeletonStatelessFF::Evaluate(const Phrase &source
}
-void SkeletonStatelessFF::Evaluate(const InputType &input
+void SkeletonStatelessFF::EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore) const
-{}
+{
+ if (targetPhrase.GetNumNonTerminals()) {
+ vector<float> newScores(m_numScoreComponents);
+ newScores[0] = - std::numeric_limits<float>::infinity();
+ scoreBreakdown.PlusEquals(this, newScores);
+ }
+
+}
-void SkeletonStatelessFF::Evaluate(const Hypothesis& hypo,
+void SkeletonStatelessFF::EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
-void SkeletonStatelessFF::EvaluateChart(const ChartHypothesis &hypo,
+void SkeletonStatelessFF::EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
+void SkeletonStatelessFF::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "arg") {
+ // set value here
+ } else {
+ StatelessFeatureFunction::SetParameter(key, value);
+ }
+}
+
}
diff --git a/moses/FF/SkeletonStatelessFF.h b/moses/FF/SkeletonStatelessFF.h
index 6aac207f4..520ec1405 100644
--- a/moses/FF/SkeletonStatelessFF.h
+++ b/moses/FF/SkeletonStatelessFF.h
@@ -9,29 +9,29 @@ namespace Moses
class SkeletonStatelessFF : public StatelessFeatureFunction
{
public:
- SkeletonStatelessFF(const std::string &line)
- :StatelessFeatureFunction(2, line)
- {}
+ SkeletonStatelessFF(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const;
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const;
+ void SetParameter(const std::string& key, const std::string& value);
+
};
}
diff --git a/moses/FF/SoftMatchingFeature.cpp b/moses/FF/SoftMatchingFeature.cpp
index 3e4e9db43..0475547da 100644
--- a/moses/FF/SoftMatchingFeature.cpp
+++ b/moses/FF/SoftMatchingFeature.cpp
@@ -24,8 +24,8 @@ void SoftMatchingFeature::SetParameter(const std::string& key, const std::string
m_tuneable = Scan<bool>(value);
} else if (key == "filterable") { //ignore
} else if (key == "path") {
- const std::string filePath = value;
- Load(filePath);
+ const std::string filePath = value;
+ Load(filePath);
} else {
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
}
@@ -35,34 +35,34 @@ void SoftMatchingFeature::SetParameter(const std::string& key, const std::string
bool SoftMatchingFeature::Load(const std::string& filePath)
{
- StaticData &staticData = StaticData::InstanceNonConst();
+ StaticData &staticData = StaticData::InstanceNonConst();
- InputFileStream inStream(filePath);
- std::string line;
- while(getline(inStream, line)) {
- std::vector<std::string> tokens = Tokenize(line);
- UTIL_THROW_IF2(tokens.size() != 2, "Error: wrong format of SoftMatching file: must have two nonterminals per line");
+ InputFileStream inStream(filePath);
+ std::string line;
+ while(getline(inStream, line)) {
+ std::vector<std::string> tokens = Tokenize(line);
+ UTIL_THROW_IF2(tokens.size() != 2, "Error: wrong format of SoftMatching file: must have two nonterminals per line");
- // no soft matching necessary if LHS and RHS are the same
- if (tokens[0] == tokens[1]) {
- continue;
- }
+ // no soft matching necessary if LHS and RHS are the same
+ if (tokens[0] == tokens[1]) {
+ continue;
+ }
- Word LHS, RHS;
- LHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[0], true);
- RHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[1], true);
+ Word LHS, RHS;
+ LHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[0], true);
+ RHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[1], true);
- m_softMatches[RHS[0]->GetId()].push_back(LHS);
- GetOrSetFeatureName(RHS, LHS);
- }
+ m_softMatches[RHS[0]->GetId()].push_back(LHS);
+ GetOrSetFeatureName(RHS, LHS);
+ }
- staticData.SetSoftMatches(m_softMatches);
+ staticData.SetSoftMatches(m_softMatches);
- return true;
+ return true;
}
-void SoftMatchingFeature::EvaluateChart(const ChartHypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+void SoftMatchingFeature::EvaluateWhenApplied(const ChartHypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
{
const TargetPhrase& target = hypo.GetCurrTargetPhrase();
@@ -87,8 +87,7 @@ void SoftMatchingFeature::EvaluateChart(const ChartHypothesis& hypo,
}
// when loading, or when we notice that non-terminals have been added after loading, we resize vectors
-void SoftMatchingFeature::ResizeCache() const
-{
+void SoftMatchingFeature::ResizeCache() const {
FactorCollection& fc = FactorCollection::Instance();
size_t numNonTerminals = fc.GetNumNonTerminals();
@@ -99,8 +98,7 @@ void SoftMatchingFeature::ResizeCache() const
}
-const std::string& SoftMatchingFeature::GetOrSetFeatureName(const Word& RHS, const Word& LHS) const
-{
+const std::string& SoftMatchingFeature::GetOrSetFeatureName(const Word& RHS, const Word& LHS) const {
try {
#ifdef WITH_THREADS //try read-only lock
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
@@ -109,22 +107,23 @@ const std::string& SoftMatchingFeature::GetOrSetFeatureName(const Word& RHS, con
if (!name.empty()) {
return name;
}
- } catch (const std::out_of_range& oor) {
+ }
+ catch (const std::out_of_range& oor) {
#ifdef WITH_THREADS //need to resize cache; write lock
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
ResizeCache();
}
#ifdef WITH_THREADS //need to update cache; write lock
- boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
- std::string &name = m_nameCache[RHS[0]->GetId()][LHS[0]->GetId()];
- const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
- std::string LHS_string = LHS.GetString(outputFactorOrder, false);
- std::string RHS_string = RHS.GetString(outputFactorOrder, false);
- name = LHS_string + "->" + RHS_string;
- return name;
-}
+ std::string &name = m_nameCache[RHS[0]->GetId()][LHS[0]->GetId()];
+ const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+ std::string LHS_string = LHS.GetString(outputFactorOrder, false);
+ std::string RHS_string = RHS.GetString(outputFactorOrder, false);
+ name = LHS_string + "->" + RHS_string;
+ return name;
+ }
}
diff --git a/moses/FF/SoftMatchingFeature.h b/moses/FF/SoftMatchingFeature.h
index b823c2426..ff923ea08 100644
--- a/moses/FF/SoftMatchingFeature.h
+++ b/moses/FF/SoftMatchingFeature.h
@@ -19,20 +19,20 @@ public:
return true;
}
- virtual void EvaluateChart(const ChartHypothesis& hypo,
+ virtual void EvaluateWhenApplied(const ChartHypothesis& hypo,
ScoreComponentCollection* accumulator) const;
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const {};
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const {};
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const {};
bool Load(const std::string &filePath);
diff --git a/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp b/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp
new file mode 100644
index 000000000..368e1597c
--- /dev/null
+++ b/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp
@@ -0,0 +1,564 @@
+#include <vector>
+#include <limits>
+#include <assert.h>
+#include "SoftSourceSyntacticConstraintsFeature.h"
+#include "moses/StaticData.h"
+#include "moses/InputFileStream.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/ChartManager.h"
+#include "moses/FactorCollection.h"
+#include "moses/TreeInput.h"
+#include "moses/PP/SourceLabelsPhraseProperty.h"
+
+
+using namespace std;
+
+namespace Moses
+{
+
+SoftSourceSyntacticConstraintsFeature::SoftSourceSyntacticConstraintsFeature(const std::string &line)
+ : StatelessFeatureFunction(3, line), m_featureVariant(0)
+{
+ VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
+ ReadParameters();
+ VERBOSE(1, " Done.");
+ VERBOSE(1, " Feature variant: " << m_featureVariant << "." << std::endl);
+}
+
+void SoftSourceSyntacticConstraintsFeature::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "sourceLabelSetFile") {
+ m_sourceLabelSetFile = value;
+ } else if (key == "coreSourceLabelSetFile") {
+ m_coreSourceLabelSetFile = value;
+ } else if (key == "targetSourceLeftHandSideJointCountFile") {
+ m_targetSourceLHSJointCountFile = value;
+ } else if (key == "featureVariant") {
+ m_featureVariant = Scan<size_t>(value); // 0: only dense features, 1: no mismatches (also set weights 1 0 0 and tuneable=false), 2: with sparse features, 3: with sparse features for core labels only
+ } else {
+ StatelessFeatureFunction::SetParameter(key, value);
+ }
+}
+
+
+void SoftSourceSyntacticConstraintsFeature::Load()
+{
+ // don't change the loading order!
+ LoadSourceLabelSet();
+ if (m_featureVariant == 3) {
+ LoadCoreSourceLabelSet();
+ }
+ if (!m_targetSourceLHSJointCountFile.empty()) {
+ LoadTargetSourceLeftHandSideJointCountFile();
+ }
+}
+
+void SoftSourceSyntacticConstraintsFeature::LoadSourceLabelSet()
+{
+ VERBOSE(2, GetScoreProducerDescription() << ": Loading source label set from file " << m_sourceLabelSetFile << std::endl);
+ InputFileStream inFile(m_sourceLabelSetFile);
+
+ FactorCollection &factorCollection = FactorCollection::Instance();
+
+ // read source label set
+ std::string line;
+ m_sourceLabels.clear();
+ m_sourceLabelsByIndex.clear();
+ m_sourceLabelIndexesByFactor.clear();
+ while (getline(inFile, line)) {
+ std::istringstream tokenizer(line);
+ std::string label;
+ size_t index;
+ try {
+ tokenizer >> label >> index;
+ } catch (const std::exception &e) {
+ UTIL_THROW2(GetScoreProducerDescription()
+ << ": Error reading source label set file " << m_sourceLabelSetFile << " .");
+ }
+ std::pair< boost::unordered_map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
+ UTIL_THROW_IF2(!inserted.second, GetScoreProducerDescription()
+ << ": Source label set file " << m_sourceLabelSetFile << " should contain each syntactic label only once.");
+
+ if (index >= m_sourceLabelsByIndex.size()) {
+ m_sourceLabelsByIndex.resize(index+1);
+ }
+ m_sourceLabelsByIndex[index] = label;
+ const Factor* sourceLabelFactor = factorCollection.AddFactor(label,true);
+ m_sourceLabelIndexesByFactor[sourceLabelFactor] = index;
+ }
+
+ inFile.Close();
+
+ std::list<std::string> specialLabels;
+ specialLabels.push_back("GlueTop");
+ specialLabels.push_back("GlueX");
+// specialLabels.push_back("XRHS");
+// specialLabels.push_back("XLHS");
+ for (std::list<std::string>::const_iterator iter=specialLabels.begin();
+ iter!=specialLabels.end(); ++iter) {
+ boost::unordered_map<std::string,size_t>::iterator found = m_sourceLabels.find(*iter);
+ UTIL_THROW_IF2(found == m_sourceLabels.end(), GetScoreProducerDescription()
+ << ": Source label set file " << m_sourceLabelSetFile << " should contain an entry for the special label \"" << *iter << "\".");
+ if (!(found->first).compare("GlueTop")) {
+ m_GlueTopLabel = found->second;
+// } else if (!(found->first).compare("XRHS")) {
+// m_XRHSLabel = found->second;
+// } else if (!(found->first).compare("XLHS")) {
+// m_XLHSLabel = found->second;
+ }
+ }
+}
+
+void SoftSourceSyntacticConstraintsFeature::LoadCoreSourceLabelSet()
+{
+ VERBOSE(2, GetScoreProducerDescription() << ": Loading core source label set from file " << m_coreSourceLabelSetFile << std::endl);
+ InputFileStream inFile(m_coreSourceLabelSetFile);
+
+ // read core source label set
+ std::string line;
+ m_coreSourceLabels.clear();
+ while (getline(inFile, line)) {
+ istringstream tokenizer(line);
+ std::string label;
+ tokenizer >> label;
+ boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( label );
+ if ( foundSourceLabelIndex != m_sourceLabels.end() ) {
+ m_coreSourceLabels.insert(foundSourceLabelIndex->second);
+ } else {
+ VERBOSE(2, GetScoreProducerDescription()
+ << ": Ignoring unknown source label \"" << label << "\" "
+ << "from core source label set file " << m_coreSourceLabelSetFile << "."
+ << std::endl);
+ }
+ }
+
+ inFile.Close();
+}
+
+void SoftSourceSyntacticConstraintsFeature::LoadTargetSourceLeftHandSideJointCountFile()
+{
+
+ VERBOSE(2, GetScoreProducerDescription() << ": Loading target/source label joint counts from file " << m_targetSourceLHSJointCountFile << std::endl);
+ InputFileStream inFile(m_targetSourceLHSJointCountFile);
+
+ for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
+ iter!=m_labelPairProbabilities.end(); ++iter) {
+ delete iter->second;
+ }
+ m_labelPairProbabilities.clear();
+
+ // read joint counts
+ std::string line;
+ FactorCollection &factorCollection = FactorCollection::Instance();
+ boost::unordered_map<const Factor*,float> targetLHSCounts;
+ std::vector<float> sourceLHSCounts(m_sourceLabels.size(),0.0);
+
+ while (getline(inFile, line)) {
+ istringstream tokenizer(line);
+ std::string targetLabel;
+ std::string sourceLabel;
+ float count;
+ tokenizer >> targetLabel;
+ tokenizer >> sourceLabel;
+ tokenizer >> count;
+
+ boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( sourceLabel );
+ UTIL_THROW_IF2(foundSourceLabelIndex == m_sourceLabels.end(), GetScoreProducerDescription()
+ << ": Target/source label joint count file " << m_targetSourceLHSJointCountFile
+ << " contains unknown source label \"" << sourceLabel << "\".");
+
+ const Factor* targetLabelFactor = factorCollection.AddFactor(targetLabel,true);
+
+ sourceLHSCounts[foundSourceLabelIndex->second] += count;
+ std::pair< boost::unordered_map<const Factor*,float >::iterator, bool > insertedTargetLHSCount =
+ targetLHSCounts.insert( std::pair<const Factor*,float>(targetLabelFactor,count) );
+ if (!insertedTargetLHSCount.second) {
+ (insertedTargetLHSCount.first)->second += count;
+ boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator jointCountIt =
+ m_labelPairProbabilities.find( targetLabelFactor );
+ assert(jointCountIt != m_labelPairProbabilities.end());
+ (jointCountIt->second)->at(foundSourceLabelIndex->second).first += count;
+ (jointCountIt->second)->at(foundSourceLabelIndex->second).second += count;
+ } else {
+ std::pair<float,float> init(0.0,0.0);
+ std::vector< std::pair<float,float> >* sourceVector = new std::vector< std::pair<float,float> >(m_sourceLabels.size(),init);
+ sourceVector->at(foundSourceLabelIndex->second) = std::pair<float,float>(count,count);
+ std::pair< boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator, bool > insertedJointCount =
+ m_labelPairProbabilities.insert( std::pair<const Factor*, std::vector< std::pair<float,float> >* >(targetLabelFactor,sourceVector) );
+ assert(insertedJointCount.second);
+ }
+ }
+
+ // normalization
+ for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
+ iter!=m_labelPairProbabilities.end(); ++iter) {
+ float targetLHSCount = 0;
+ boost::unordered_map<const Factor*,float >::const_iterator targetLHSCountIt = targetLHSCounts.find( iter->first );
+ if ( targetLHSCountIt != targetLHSCounts.end() ) {
+ targetLHSCount = targetLHSCountIt->second;
+ }
+ std::vector< std::pair<float,float> > &probabilities = *(iter->second);
+ for (size_t index=0; index<probabilities.size(); ++index) {
+
+ if ( probabilities[index].first != 0 ) {
+ assert(targetLHSCount != 0);
+ probabilities[index].first /= targetLHSCount;
+ }
+ if ( probabilities[index].second != 0 ) {
+ assert(sourceLHSCounts[index] != 0);
+ probabilities[index].second /= sourceLHSCounts[index];
+ }
+ }
+ }
+
+ inFile.Close();
+}
+
+
+void SoftSourceSyntacticConstraintsFeature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+ assert(stackVec);
+
+ IFFEATUREVERBOSE(2)
+ {
+ FEATUREVERBOSE(2, targetPhrase << std::endl);
+ FEATUREVERBOSE(2, inputPath << std::endl);
+ for (size_t i = 0; i < stackVec->size(); ++i)
+ {
+ const ChartCellLabel &cell = *stackVec->at(i);
+ const WordsRange &ntRange = cell.GetCoverage();
+ FEATUREVERBOSE(2, "stackVec[ " << i << " ] : " << ntRange.GetStartPos() << " - " << ntRange.GetEndPos() << std::endl);
+ }
+
+ for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignTerm().begin();
+ it!=targetPhrase.GetAlignTerm().end(); ++it)
+ {
+ FEATUREVERBOSE(2, "alignTerm " << it->first << " " << it->second << std::endl);
+ }
+
+ for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin();
+ it!=targetPhrase.GetAlignNonTerm().end(); ++it)
+ {
+ FEATUREVERBOSE(2, "alignNonTerm " << it->first << " " << it->second << std::endl);
+ }
+ }
+
+ // dense scores
+ std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 3
+
+ const TreeInput& treeInput = static_cast<const TreeInput&>(input);
+ const StaticData& staticData = StaticData::Instance();
+ const Word& outputDefaultNonTerminal = staticData.GetOutputDefaultNonTerminal();
+
+ size_t nNTs = 1;
+ bool treeInputMismatchLHSBinary = true;
+ size_t treeInputMismatchRHSCount = 0;
+ bool hasCompleteTreeInputMatch = false;
+ float t2sLabelsProb = 1;
+ float s2tLabelsProb = 1;
+ float ruleLabelledProbability = 1;
+
+ // read SourceLabels property
+ const Factor* targetLHS = targetPhrase.GetTargetLHS()[0];
+ bool isGlueGrammarRule = false;
+ bool isUnkRule = false;
+
+ if (const PhraseProperty *property = targetPhrase.GetProperty("SourceLabels")) {
+
+ const SourceLabelsPhraseProperty *sourceLabelsPhraseProperty = static_cast<const SourceLabelsPhraseProperty*>(property);
+
+ nNTs = sourceLabelsPhraseProperty->GetNumberOfNonTerminals();
+ float totalCount = sourceLabelsPhraseProperty->GetTotalCount();
+
+ // prepare for input tree label matching
+ std::vector< boost::unordered_set<size_t> > treeInputLabelsRHS(nNTs-1);
+ boost::unordered_set<size_t> treeInputLabelsLHS;
+
+ // get index map for underlying hypotheses
+ const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
+ targetPhrase.GetAlignNonTerm().GetNonTermIndexMap();
+
+ std::vector<const Factor*> targetLabelsRHS;
+ if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
+ size_t nonTerminalNumber = 0;
+
+ for (size_t phrasePos=0; phrasePos<targetPhrase.GetSize(); ++phrasePos) {
+ // consult rule for either word or non-terminal
+ const Word &word = targetPhrase.GetWord(phrasePos);
+ if ( word.IsNonTerminal() ) {
+ // non-terminal: consult subderivation
+ size_t nonTermIndex = nonTermIndexMap[phrasePos];
+ targetLabelsRHS.push_back( word[0] );
+
+ // retrieve information that is required for input tree label matching (RHS)
+ const ChartCellLabel &cell = *stackVec->at(nonTermIndex);
+ const WordsRange& prevWordsRange = cell.GetCoverage();
+ size_t prevStartPos = prevWordsRange.GetStartPos();
+ size_t prevEndPos = prevWordsRange.GetEndPos();
+ const NonTerminalSet& prevTreeInputLabels = treeInput.GetLabelSet(prevStartPos,prevEndPos);
+
+ for (NonTerminalSet::const_iterator prevTreeInputLabelsIt = prevTreeInputLabels.begin();
+ prevTreeInputLabelsIt != prevTreeInputLabels.end(); ++prevTreeInputLabelsIt)
+ {
+ if (*prevTreeInputLabelsIt != outputDefaultNonTerminal)
+ {
+ boost::unordered_map<const Factor*,size_t>::const_iterator foundPrevTreeInputLabel
+ = m_sourceLabelIndexesByFactor.find((*prevTreeInputLabelsIt)[0]);
+ if (foundPrevTreeInputLabel != m_sourceLabelIndexesByFactor.end())
+ {
+ size_t prevTreeInputLabelIndex = foundPrevTreeInputLabel->second;
+ treeInputLabelsRHS[nonTerminalNumber].insert(prevTreeInputLabelIndex);
+ }
+ }
+ }
+
+ ++nonTerminalNumber;
+ }
+ }
+ }
+
+ // retrieve information that is required for input tree label matching (LHS)
+ const WordsRange& wordsRange = inputPath.GetWordsRange();
+ size_t startPos = wordsRange.GetStartPos();
+ size_t endPos = wordsRange.GetEndPos();
+ const NonTerminalSet& treeInputLabels = treeInput.GetLabelSet(startPos,endPos);
+
+ for (NonTerminalSet::const_iterator treeInputLabelsIt = treeInputLabels.begin();
+ treeInputLabelsIt != treeInputLabels.end(); ++treeInputLabelsIt) {
+ if (*treeInputLabelsIt != outputDefaultNonTerminal) {
+ boost::unordered_map<const Factor*,size_t>::const_iterator foundTreeInputLabel
+ = m_sourceLabelIndexesByFactor.find((*treeInputLabelsIt)[0]);
+ if (foundTreeInputLabel != m_sourceLabelIndexesByFactor.end()) {
+ size_t treeInputLabelIndex = foundTreeInputLabel->second;
+ treeInputLabelsLHS.insert(treeInputLabelIndex);
+ }
+ }
+ }
+
+
+ // inspect source-labelled rule items
+
+ std::vector< boost::unordered_set<size_t> > sparseScoredTreeInputLabelsRHS(nNTs-1);
+ boost::unordered_set<size_t> sparseScoredTreeInputLabelsLHS;
+
+ std::vector<bool> sourceLabelSeenAsLHS(m_sourceLabels.size(),false);
+ std::vector<bool> treeInputMatchRHSCountByNonTerminal(nNTs-1,false);
+
+ const std::list<SourceLabelsPhrasePropertyItem> &sourceLabelItems = sourceLabelsPhraseProperty->GetSourceLabelItems();
+
+ for (std::list<SourceLabelsPhrasePropertyItem>::const_iterator sourceLabelItem = sourceLabelItems.begin();
+ sourceLabelItem != sourceLabelItems.end() && !hasCompleteTreeInputMatch; ++sourceLabelItem) {
+
+ const std::list<size_t> &sourceLabelsRHS = sourceLabelItem->GetSourceLabelsRHS();
+ // float sourceLabelsRHSCount = sourceLabelItem->GetSourceLabelsRHSCount();
+ const std::list< std::pair<size_t,float> > &sourceLabelsLHSList = sourceLabelItem->GetSourceLabelsLHSList();
+
+ assert(sourceLabelsRHS.size() == nNTs-1);
+
+ bool currentSourceLabelItemIsCompleteTreeInputMatch = true;
+
+ size_t nonTerminalNumber=0;
+ for (std::list<size_t>::const_iterator sourceLabelsRHSIt = sourceLabelsRHS.begin();
+ sourceLabelsRHSIt != sourceLabelsRHS.end(); ++sourceLabelsRHSIt, ++nonTerminalNumber) {
+
+ if (treeInputLabelsRHS[nonTerminalNumber].find(*sourceLabelsRHSIt) != treeInputLabelsRHS[nonTerminalNumber].end()) {
+
+ treeInputMatchRHSCountByNonTerminal[nonTerminalNumber] = true;
+
+ if ( m_featureVariant == 2 ||
+ (m_featureVariant == 3 && m_coreSourceLabels.find(*sourceLabelsRHSIt) != m_coreSourceLabels.end()) ) {
+ // score sparse features: RHS match
+ if (sparseScoredTreeInputLabelsRHS[nonTerminalNumber].find(*sourceLabelsRHSIt) == sparseScoredTreeInputLabelsRHS[nonTerminalNumber].end()) {
+ // (only if no match has been scored for this tree input label and rule non-terminal with a previous sourceLabelItem)
+ float score_RHS_1 = (float)1/treeInputLabelsRHS[nonTerminalNumber].size();
+ scoreBreakdown.PlusEquals(this,
+ std::string("RHS_1_" + m_sourceLabelsByIndex[*sourceLabelsRHSIt]),
+ score_RHS_1);
+ sparseScoredTreeInputLabelsRHS[nonTerminalNumber].insert(*sourceLabelsRHSIt);
+ }
+ }
+
+ } else {
+
+ currentSourceLabelItemIsCompleteTreeInputMatch = false;
+
+ }
+ }
+
+ // LHS source non-terminal labels seen with this RHS
+ bool currentSourceLabelItemHasLHSTreeInputMatch = false;
+ //float ruleLabelledCount = 0;
+ std::list< std::pair<size_t,float> >::const_iterator sourceLabelsLHSIt;
+
+ for (sourceLabelsLHSIt = sourceLabelsLHSList.begin(); sourceLabelsLHSIt != sourceLabelsLHSList.end(); ++sourceLabelsLHSIt) {
+
+ if ( sourceLabelsLHSIt->first == m_GlueTopLabel ) {
+ isGlueGrammarRule = true;
+ }
+
+ if (treeInputLabelsLHS.find(sourceLabelsLHSIt->first) != treeInputLabelsLHS.end()) {
+
+ currentSourceLabelItemHasLHSTreeInputMatch = true;
+
+ if ( m_featureVariant == 2 ||
+ (m_featureVariant == 3 && m_coreSourceLabels.find(sourceLabelsLHSIt->first) != m_coreSourceLabels.end()) ) {
+ // score sparse features: LHS match
+ if (sparseScoredTreeInputLabelsLHS.find(sourceLabelsLHSIt->first) == sparseScoredTreeInputLabelsLHS.end()) {
+ // (only if no match has been scored for this tree input label and rule non-terminal with a previous sourceLabelItem)
+ float score_LHS_1 = (float)1/treeInputLabelsLHS.size();
+ scoreBreakdown.PlusEquals(this,
+ std::string("LHS_1_" + m_sourceLabelsByIndex[sourceLabelsLHSIt->first]),
+ score_LHS_1);
+ sparseScoredTreeInputLabelsLHS.insert(sourceLabelsLHSIt->first);
+ }
+ }
+ break;
+
+ }
+ }
+
+ if (currentSourceLabelItemHasLHSTreeInputMatch) {
+ // input tree matching (LHS)
+ treeInputMismatchLHSBinary = false;
+ } else {
+ currentSourceLabelItemIsCompleteTreeInputMatch = false;
+ }
+
+ if (currentSourceLabelItemIsCompleteTreeInputMatch) {
+ hasCompleteTreeInputMatch = true;
+
+ ruleLabelledProbability = sourceLabelsLHSIt->second / totalCount;
+ std::pair<float,float> probPair = GetLabelPairProbabilities( targetLHS, sourceLabelsLHSIt->first);
+ t2sLabelsProb = probPair.first;
+ s2tLabelsProb = probPair.second;
+ nonTerminalNumber=0;
+ for (std::list<size_t>::const_iterator sourceLabelsRHSIt = sourceLabelsRHS.begin();
+ sourceLabelsRHSIt != sourceLabelsRHS.end(); ++sourceLabelsRHSIt, ++nonTerminalNumber) {
+ probPair = GetLabelPairProbabilities( targetLabelsRHS[nonTerminalNumber], *sourceLabelsRHSIt );
+ t2sLabelsProb += probPair.first;
+ s2tLabelsProb += probPair.second;
+ }
+ t2sLabelsProb /= nNTs;
+ s2tLabelsProb /= nNTs;
+ assert(t2sLabelsProb != 0);
+ assert(s2tLabelsProb != 0);
+ }
+
+ }
+
+ // input tree matching (RHS)
+ if ( !hasCompleteTreeInputMatch ) {
+ treeInputMismatchRHSCount = nNTs-1;
+ for (std::vector<bool>::const_iterator treeInputMatchRHSCountByNonTerminalIt = treeInputMatchRHSCountByNonTerminal.begin();
+ treeInputMatchRHSCountByNonTerminalIt != treeInputMatchRHSCountByNonTerminal.end(); ++treeInputMatchRHSCountByNonTerminalIt) {
+ if (*treeInputMatchRHSCountByNonTerminalIt) {
+ --treeInputMismatchRHSCount;
+ }
+ }
+ }
+
+ // score sparse features: mismatches
+ if ( m_featureVariant == 2 || m_featureVariant == 3 ) {
+
+ // RHS
+
+ for (size_t nonTerminalNumber = 0; nonTerminalNumber < nNTs-1; ++nonTerminalNumber) {
+ // nNTs-1 because nNTs also counts the left-hand side non-terminal
+
+ float score_RHS_0 = (float)1/treeInputLabelsRHS[nonTerminalNumber].size();
+ for (boost::unordered_set<size_t>::const_iterator treeInputLabelsRHSIt = treeInputLabelsRHS[nonTerminalNumber].begin();
+ treeInputLabelsRHSIt != treeInputLabelsRHS[nonTerminalNumber].end(); ++treeInputLabelsRHSIt) {
+
+ if ( m_featureVariant == 2 ||
+ (m_featureVariant == 3 && m_coreSourceLabels.find(*treeInputLabelsRHSIt) != m_coreSourceLabels.end()) ) {
+
+ if (sparseScoredTreeInputLabelsRHS[nonTerminalNumber].find(*treeInputLabelsRHSIt) == sparseScoredTreeInputLabelsRHS[nonTerminalNumber].end()) {
+ // score sparse features: RHS mismatch
+ scoreBreakdown.PlusEquals(this,
+ std::string("RHS_0_" + m_sourceLabelsByIndex[*treeInputLabelsRHSIt]),
+ score_RHS_0);
+ }
+ }
+ }
+ }
+
+ // LHS
+
+ float score_LHS_0 = (float)1/treeInputLabelsLHS.size();
+ for (boost::unordered_set<size_t>::const_iterator treeInputLabelsLHSIt = treeInputLabelsLHS.begin();
+ treeInputLabelsLHSIt != treeInputLabelsLHS.end(); ++treeInputLabelsLHSIt) {
+
+ if ( m_featureVariant == 2 ||
+ (m_featureVariant == 3 && m_coreSourceLabels.find(*treeInputLabelsLHSIt) != m_coreSourceLabels.end()) ) {
+
+ if (sparseScoredTreeInputLabelsLHS.find(*treeInputLabelsLHSIt) == sparseScoredTreeInputLabelsLHS.end()) {
+ // score sparse features: RHS mismatch
+ scoreBreakdown.PlusEquals(this,
+ std::string("LHS_0_" + m_sourceLabelsByIndex[*treeInputLabelsLHSIt]),
+ score_LHS_0);
+ }
+ }
+ }
+
+ }
+
+ } else {
+
+ // abort with error message if the phrase does not translate an unknown word
+ UTIL_THROW_IF2(!targetPhrase.GetWord(0).IsOOV(), GetScoreProducerDescription()
+ << ": Missing SourceLabels property. "
+ << "Please check phrase table and glue rules.");
+
+ // unknown word
+ isUnkRule = true;
+
+ }
+
+ // add scores
+
+ // input tree matching
+ switch (m_featureVariant) {
+
+ case 0:
+ newScores[0] = hasCompleteTreeInputMatch;
+ break;
+
+ case 1:
+ newScores[0] = ( (hasCompleteTreeInputMatch || isGlueGrammarRule || isUnkRule) ? 0 : std::numeric_limits<float>::min() );
+ break;
+
+ default:
+ newScores[0] = hasCompleteTreeInputMatch;
+ }
+ newScores[1] = treeInputMismatchLHSBinary;
+ newScores[2] = treeInputMismatchRHSCount;
+// newScores[3] = hasCompleteTreeInputMatch ? std::log(t2sLabelsProb) : 0;
+// newScores[4] = hasCompleteTreeInputMatch ? std::log(s2tLabelsProb) : 0;
+// newScores[3] = hasCompleteTreeInputMatch ? std::log(ruleLabelledProbability) : 0;
+
+ scoreBreakdown.PlusEquals(this, newScores);
+}
+
+
+std::pair<float,float> SoftSourceSyntacticConstraintsFeature::GetLabelPairProbabilities(
+ const Factor* target,
+ const size_t source) const
+{
+ boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::const_iterator found =
+ m_labelPairProbabilities.find(target);
+ if ( found == m_labelPairProbabilities.end() ) {
+ return std::pair<float,float>(0,0);
+ }
+ return found->second->at(source);
+}
+
+
+}
+
diff --git a/moses/FF/SoftSourceSyntacticConstraintsFeature.h b/moses/FF/SoftSourceSyntacticConstraintsFeature.h
new file mode 100644
index 000000000..6bcb23303
--- /dev/null
+++ b/moses/FF/SoftSourceSyntacticConstraintsFeature.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <string>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include "StatelessFeatureFunction.h"
+#include "FFState.h"
+#include "moses/Factor.h"
+
+namespace Moses
+{
+
+
+class SoftSourceSyntacticConstraintsFeature : public StatelessFeatureFunction
+{
+public:
+ SoftSourceSyntacticConstraintsFeature(const std::string &line);
+
+ ~SoftSourceSyntacticConstraintsFeature() {
+ for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
+ iter!=m_labelPairProbabilities.end(); ++iter) {
+ delete iter->second;
+ }
+ }
+
+ bool IsUseable(const FactorMask &mask) const {
+ return true;
+ }
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+ {
+ targetPhrase.SetRuleSource(source);
+ };
+
+ void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ void EvaluateWhenApplied(
+ const Hypothesis& cur_hypo,
+ ScoreComponentCollection* accumulator) const
+ {};
+
+ void EvaluateWhenApplied(
+ const ChartHypothesis& cur_hypo,
+ ScoreComponentCollection* accumulator) const
+ {};
+
+private:
+ std::string m_sourceLabelSetFile;
+ std::string m_coreSourceLabelSetFile;
+ std::string m_targetSourceLHSJointCountFile;
+ std::string m_unknownLeftHandSideFile;
+ size_t m_featureVariant;
+
+ boost::unordered_map<std::string,size_t> m_sourceLabels;
+ std::vector<std::string> m_sourceLabelsByIndex;
+ boost::unordered_set<size_t> m_coreSourceLabels;
+ boost::unordered_map<const Factor*,size_t> m_sourceLabelIndexesByFactor;
+ size_t m_GlueTopLabel;
+// mutable size_t m_XRHSLabel;
+// mutable size_t m_XLHSLabel;
+
+ boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* > m_labelPairProbabilities;
+ boost::unordered_map<size_t,float> m_unknownLHSProbabilities;
+ float m_smoothingWeight;
+ float m_unseenLHSSmoothingFactorForUnknowns;
+
+ void Load();
+ void LoadSourceLabelSet();
+ void LoadCoreSourceLabelSet();
+ void LoadTargetSourceLeftHandSideJointCountFile();
+
+ std::pair<float,float> GetLabelPairProbabilities(const Factor* target,
+ const size_t source) const;
+
+};
+
+
+}
+
diff --git a/moses/FF/SourceGHKMTreeInputMatchFeature.cpp b/moses/FF/SourceGHKMTreeInputMatchFeature.cpp
new file mode 100644
index 000000000..38238b10c
--- /dev/null
+++ b/moses/FF/SourceGHKMTreeInputMatchFeature.cpp
@@ -0,0 +1,67 @@
+#include <map>
+#include <vector>
+#include <assert.h>
+#include "SourceGHKMTreeInputMatchFeature.h"
+#include "moses/StaticData.h"
+#include "moses/InputFileStream.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/Factor.h"
+#include "moses/FactorCollection.h"
+#include "moses/InputPath.h"
+#include "moses/TreeInput.h"
+
+
+using namespace std;
+
+namespace Moses
+{
+
+SourceGHKMTreeInputMatchFeature::SourceGHKMTreeInputMatchFeature(const std::string &line)
+ : StatelessFeatureFunction(2, line)
+{
+ std::cerr << GetScoreProducerDescription() << "Initializing feature...";
+ ReadParameters();
+ std::cerr << " Done." << std::endl;
+}
+
+void SourceGHKMTreeInputMatchFeature::SetParameter(const std::string& key, const std::string& value)
+{
+ UTIL_THROW(util::Exception, GetScoreProducerDescription() << ": Unknown parameter " << key << "=" << value);
+}
+
+// assumes that source-side syntax labels are stored in the target non-terminal field of the rules
+void SourceGHKMTreeInputMatchFeature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+ const WordsRange& wordsRange = inputPath.GetWordsRange();
+ size_t startPos = wordsRange.GetStartPos();
+ size_t endPos = wordsRange.GetEndPos();
+ const TreeInput& treeInput = static_cast<const TreeInput&>(input);
+ const NonTerminalSet& treeInputLabels = treeInput.GetLabelSet(startPos,endPos);
+ const Word& lhsLabel = targetPhrase.GetTargetLHS();
+
+ const StaticData& staticData = StaticData::Instance();
+ const Word& outputDefaultNonTerminal = staticData.GetOutputDefaultNonTerminal();
+
+ std::vector<float> newScores(m_numScoreComponents,0.0); // m_numScoreComponents == 2 // first fires for matches, second for mismatches
+
+ if ( (treeInputLabels.find(lhsLabel) != treeInputLabels.end()) && (lhsLabel != outputDefaultNonTerminal) ) {
+ // match
+ newScores[0] = 1.0;
+ } else {
+ // mismatch
+ newScores[1] = 1.0;
+ }
+
+ scoreBreakdown.PlusEquals(this, newScores);
+}
+
+
+}
+
diff --git a/moses/FF/SourceGHKMTreeInputMatchFeature.h b/moses/FF/SourceGHKMTreeInputMatchFeature.h
new file mode 100644
index 000000000..743871b1c
--- /dev/null
+++ b/moses/FF/SourceGHKMTreeInputMatchFeature.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "StatelessFeatureFunction.h"
+
+namespace Moses
+{
+
+// assumes that source-side syntax labels are stored in the target non-terminal field of the rules
+class SourceGHKMTreeInputMatchFeature : public StatelessFeatureFunction
+{
+public:
+ SourceGHKMTreeInputMatchFeature(const std::string &line);
+
+ bool IsUseable(const FactorMask &mask) const {
+ return true;
+ }
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const {};
+
+ void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const {};
+
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const {};
+
+};
+
+
+}
+
diff --git a/moses/FF/SourceWordDeletionFeature.cpp b/moses/FF/SourceWordDeletionFeature.cpp
index 101e40579..dcbba6a0a 100644
--- a/moses/FF/SourceWordDeletionFeature.cpp
+++ b/moses/FF/SourceWordDeletionFeature.cpp
@@ -63,7 +63,7 @@ bool SourceWordDeletionFeature::IsUseable(const FactorMask &mask) const
return ret;
}
-void SourceWordDeletionFeature::Evaluate(const Phrase &source
+void SourceWordDeletionFeature::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
@@ -83,10 +83,7 @@ void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source,
if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
// flag aligned words
- bool aligned[16];
- UTIL_THROW_IF2(sourceLength >= 16, "Source length must be less than 16 words");
- for(size_t i=0; i<sourceLength; i++)
- aligned[i] = false;
+ std::vector<bool> aligned(sourceLength, false);
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++)
aligned[ alignmentPoint->first ] = true;
diff --git a/moses/FF/SourceWordDeletionFeature.h b/moses/FF/SourceWordDeletionFeature.h
index 9b04476af..8211ef0ca 100644
--- a/moses/FF/SourceWordDeletionFeature.h
+++ b/moses/FF/SourceWordDeletionFeature.h
@@ -28,21 +28,21 @@ public:
bool IsUseable(const FactorMask &mask) const;
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
diff --git a/moses/FF/SpanLength.cpp b/moses/FF/SpanLength.cpp
new file mode 100644
index 000000000..7a7c87be8
--- /dev/null
+++ b/moses/FF/SpanLength.cpp
@@ -0,0 +1,93 @@
+#include <boost/shared_ptr.hpp>
+#include "SpanLength.h"
+#include "moses/StaticData.h"
+#include "moses/Word.h"
+#include "moses/ChartCellLabel.h"
+#include "moses/WordsRange.h"
+#include "moses/StackVec.h"
+#include "moses/TargetPhrase.h"
+#include "moses/PP/PhraseProperty.h"
+#include "moses/PP/SpanLengthPhraseProperty.h"
+
+using namespace std;
+
+namespace Moses
+{
+SpanLength::SpanLength(const std::string &line)
+:StatelessFeatureFunction(1, line)
+,m_smoothingMethod(None)
+,m_const(0)
+{
+ ReadParameters();
+}
+
+void SpanLength::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+{
+ targetPhrase.SetRuleSource(source);
+}
+
+void SpanLength::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+ assert(stackVec);
+
+ const PhraseProperty *property = targetPhrase.GetProperty("SpanLength");
+ if (property == NULL) {
+ return;
+ }
+
+ const SpanLengthPhraseProperty *slProp = static_cast<const SpanLengthPhraseProperty*>(property);
+
+ const Phrase *ruleSource = targetPhrase.GetRuleSource();
+ assert(ruleSource);
+
+ float score = 0;
+ for (size_t i = 0; i < stackVec->size(); ++i) {
+ const ChartCellLabel &cell = *stackVec->at(i);
+ const WordsRange &ntRange = cell.GetCoverage();
+ size_t sourceWidth = ntRange.GetNumWordsCovered();
+ float prob = slProp->GetProb(i, sourceWidth, m_const);
+ score += TransformScore(prob);
+ }
+
+ if (score < -100.0f) {
+ float weight = StaticData::Instance().GetWeight(this);
+ if (weight < 0) {
+ score = -100;
+ }
+ }
+
+ scoreBreakdown.PlusEquals(this, score);
+
+}
+
+void SpanLength::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "smoothing") {
+ if (value == "plus-constant") {
+ m_smoothingMethod = PlusConst;
+ }
+ else if (value == "none") {
+ m_smoothingMethod = None;
+ }
+ else {
+ UTIL_THROW(util::Exception, "Unknown smoothing type " << value);
+ }
+ }
+ else if (key == "constant") {
+ m_const = Scan<float>(value);
+ }
+ else {
+ StatelessFeatureFunction::SetParameter(key, value);
+ }
+}
+
+}
+
diff --git a/moses/FF/SpanLength.h b/moses/FF/SpanLength.h
new file mode 100644
index 000000000..dc5564fcd
--- /dev/null
+++ b/moses/FF/SpanLength.h
@@ -0,0 +1,52 @@
+#pragma once
+#include <string>
+#include "StatelessFeatureFunction.h"
+
+namespace Moses
+{
+
+// Rule Scope - not quite completely implemented yet
+class SpanLength : public StatelessFeatureFunction
+{
+public:
+ SpanLength(const std::string &line);
+
+ virtual bool IsUseable(const FactorMask &mask) const
+ { return true; }
+
+ virtual void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
+
+ virtual void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+protected:
+ enum SmoothingMethod
+ {
+ None,
+ PlusConst,
+ };
+ SmoothingMethod m_smoothingMethod;
+
+ float m_const;
+};
+
+}
+
diff --git a/moses/FF/SparseHieroReorderingFeature.cpp b/moses/FF/SparseHieroReorderingFeature.cpp
new file mode 100644
index 000000000..0c6ac4767
--- /dev/null
+++ b/moses/FF/SparseHieroReorderingFeature.cpp
@@ -0,0 +1,222 @@
+#include <iostream>
+
+#include "moses/ChartHypothesis.h"
+#include "moses/ChartManager.h"
+#include "moses/FactorCollection.h"
+#include "moses/Sentence.h"
+
+#include "util/exception.hh"
+
+#include "SparseHieroReorderingFeature.h"
+
+using namespace std;
+
+namespace Moses
+{
+
+SparseHieroReorderingFeature::SparseHieroReorderingFeature(const std::string &line)
+ :StatelessFeatureFunction(0, line),
+ m_type(SourceCombined),
+ m_sourceFactor(0),
+ m_targetFactor(0),
+ m_sourceVocabFile(""),
+ m_targetVocabFile("")
+{
+
+ /*
+ Configuration of features.
+ factor - Which factor should it apply to
+ type - what type of sparse reordering feature. e.g. block (modelled on Matthias
+ Huck's EAMT 2012 features)
+ word - which words to include, e.g. src_bdry, src_all, tgt_bdry , ...
+ vocab - vocab file to limit it to
+ orientation - e.g. lr, etc.
+ */
+ cerr << "Constructing a Sparse Reordering feature" << endl;
+ ReadParameters();
+ m_otherFactor = FactorCollection::Instance().AddFactor("##OTHER##");
+ LoadVocabulary(m_sourceVocabFile, m_sourceVocab);
+ LoadVocabulary(m_targetVocabFile, m_targetVocab);
+}
+
+void SparseHieroReorderingFeature::SetParameter(const std::string& key, const std::string& value) {
+ if (key == "input-factor") {
+ m_sourceFactor = Scan<FactorType>(value);
+ } else if (key == "output-factor") {
+ m_targetFactor = Scan<FactorType>(value);
+ } else if (key == "input-vocab-file") {
+ m_sourceVocabFile = value;
+ } else if (key == "output-vocab-file") {
+ m_targetVocabFile = value;
+ } else if (key == "type") {
+ if (value == "SourceCombined") {
+ m_type = SourceCombined;
+ } else if (value == "SourceLeft") {
+ m_type = SourceLeft;
+ } else if (value == "SourceRight") {
+ m_type = SourceRight;
+ } else {
+ UTIL_THROW(util::Exception, "Unknown sparse reordering type " << value);
+ }
+ } else {
+ FeatureFunction::SetParameter(key, value);
+ }
+}
+
+void SparseHieroReorderingFeature::LoadVocabulary(const std::string& filename, Vocab& vocab)
+{
+ if (filename.empty()) return;
+ ifstream in(filename.c_str());
+ UTIL_THROW_IF(!in, util::Exception, "Unable to open vocab file: " << filename);
+ string line;
+ while(getline(in,line)) {
+ vocab.insert(FactorCollection::Instance().AddFactor(line));
+ }
+ in.close();
+}
+
+const Factor* SparseHieroReorderingFeature::GetFactor(const Word& word, const Vocab& vocab, FactorType factorType) const {
+ const Factor* factor = word.GetFactor(factorType);
+ if (vocab.size() && vocab.find(factor) == vocab.end()) return m_otherFactor;
+ return factor;
+}
+
+void SparseHieroReorderingFeature::EvaluateWhenApplied(
+ const ChartHypothesis& cur_hypo ,
+ ScoreComponentCollection* accumulator) const
+{
+ // get index map for underlying hypotheses
+ //const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
+ // cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
+
+ //The Huck features. For a rule with source side:
+ // abXcdXef
+ //We first have to split into blocks:
+ // ab X cd X ef
+ //Then we extract features based in the boundary words of the neighbouring blocks
+ //For the block pair, we use the right word of the left block, and the left
+ //word of the right block.
+
+ //Need to get blocks, and their alignment. Each block has a word range (on the
+ // on the source), a non-terminal flag, and a set of alignment points in the target phrase
+
+ //We need to be able to map source word position to target word position, as
+ //much as possible (don't need interior of non-terminals). The alignment info
+ //objects just give us the mappings between *rule* positions. So if we can
+ //map source word position to source rule position, and target rule position
+ //to target word position, then we can map right through.
+
+ size_t sourceStart = cur_hypo.GetCurrSourceRange().GetStartPos();
+ size_t sourceSize = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
+
+ vector<WordsRange> sourceNTSpans;
+ for (size_t prevHypoId = 0; prevHypoId < cur_hypo.GetPrevHypos().size(); ++prevHypoId) {
+ sourceNTSpans.push_back(cur_hypo.GetPrevHypo(prevHypoId)->GetCurrSourceRange());
+ }
+ //put in source order. Is this necessary?
+ sort(sourceNTSpans.begin(), sourceNTSpans.end());
+ //cerr << "Source NTs: ";
+ //for (size_t i = 0; i < sourceNTSpans.size(); ++i) cerr << sourceNTSpans[i] << " ";
+ //cerr << endl;
+
+ typedef pair<WordsRange,bool> Block;//flag indicates NT
+ vector<Block> sourceBlocks;
+ sourceBlocks.push_back(Block(cur_hypo.GetCurrSourceRange(),false));
+ for (vector<WordsRange>::const_iterator i = sourceNTSpans.begin();
+ i != sourceNTSpans.end(); ++i) {
+ const WordsRange& prevHypoRange = *i;
+ Block lastBlock = sourceBlocks.back();
+ sourceBlocks.pop_back();
+ //split this range into before NT, NT and after NT
+ if (prevHypoRange.GetStartPos() > lastBlock.first.GetStartPos()) {
+ sourceBlocks.push_back(Block(WordsRange(lastBlock.first.GetStartPos(),prevHypoRange.GetStartPos()-1),false));
+ }
+ sourceBlocks.push_back(Block(prevHypoRange,true));
+ if (prevHypoRange.GetEndPos() < lastBlock.first.GetEndPos()) {
+ sourceBlocks.push_back(Block(WordsRange(prevHypoRange.GetEndPos()+1,lastBlock.first.GetEndPos()), false));
+ }
+ }
+ /*
+ cerr << "Source Blocks: ";
+ for (size_t i = 0; i < sourceBlocks.size(); ++i) cerr << sourceBlocks[i].first << " "
+ << (sourceBlocks[i].second ? "NT" : "T") << " ";
+ cerr << endl;
+ */
+
+ //Mapping from source word to target rule position
+ vector<size_t> sourceWordToTargetRulePos(sourceSize);
+ map<size_t,size_t> alignMap;
+ alignMap.insert(
+ cur_hypo.GetCurrTargetPhrase().GetAlignTerm().begin(),
+ cur_hypo.GetCurrTargetPhrase().GetAlignTerm().end());
+ alignMap.insert(
+ cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().begin(),
+ cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().end());
+ //vector<size_t> alignMapTerm = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm()
+ size_t sourceRulePos = 0;
+ //cerr << "SW->RP ";
+ for (vector<Block>::const_iterator sourceBlockIt = sourceBlocks.begin();
+ sourceBlockIt != sourceBlocks.end(); ++sourceBlockIt) {
+ for (size_t sourceWordPos = sourceBlockIt->first.GetStartPos();
+ sourceWordPos <= sourceBlockIt->first.GetEndPos(); ++sourceWordPos) {
+ sourceWordToTargetRulePos[sourceWordPos - sourceStart] = alignMap[sourceRulePos];
+ // cerr << sourceWordPos - sourceStart << "-" << alignMap[sourceRulePos] << " ";
+ if (! sourceBlockIt->second) {
+ //T
+ ++sourceRulePos;
+ }
+ }
+ if ( sourceBlockIt->second) {
+ //NT
+ ++sourceRulePos;
+ }
+ }
+ //cerr << endl;
+
+ //Iterate through block pairs
+ const Sentence& sentence =
+ dynamic_cast<const Sentence&>(cur_hypo.GetManager().GetSource());
+ //const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ for (size_t i = 0; i < sourceBlocks.size()-1; ++i) {
+ Block& leftSourceBlock = sourceBlocks[i];
+ Block& rightSourceBlock = sourceBlocks[i+1];
+ size_t sourceLeftBoundaryPos = leftSourceBlock.first.GetEndPos();
+ size_t sourceRightBoundaryPos = rightSourceBlock.first.GetStartPos();
+ const Word& sourceLeftBoundaryWord = sentence.GetWord(sourceLeftBoundaryPos);
+ const Word& sourceRightBoundaryWord = sentence.GetWord(sourceRightBoundaryPos);
+ sourceLeftBoundaryPos -= sourceStart;
+ sourceRightBoundaryPos -= sourceStart;
+
+ // Need to figure out where these map to on the target.
+ size_t targetLeftRulePos =
+ sourceWordToTargetRulePos[sourceLeftBoundaryPos];
+ size_t targetRightRulePos =
+ sourceWordToTargetRulePos[sourceRightBoundaryPos];
+
+ bool isMonotone = true;
+ if ((sourceLeftBoundaryPos < sourceRightBoundaryPos &&
+ targetLeftRulePos > targetRightRulePos) ||
+ ((sourceLeftBoundaryPos > sourceRightBoundaryPos &&
+ targetLeftRulePos < targetRightRulePos)))
+ {
+ isMonotone = false;
+ }
+ stringstream buf;
+ buf << "h_"; //sparse reordering, Huck
+ if (m_type == SourceLeft || m_type == SourceCombined) {
+ buf << GetFactor(sourceLeftBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString();
+ buf << "_";
+ }
+ if (m_type == SourceRight || m_type == SourceCombined) {
+ buf << GetFactor(sourceRightBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString();
+ buf << "_";
+ }
+ buf << (isMonotone ? "M" : "S");
+ accumulator->PlusEquals(this,buf.str(), 1);
+ }
+// cerr << endl;
+}
+
+
+}
+
diff --git a/moses/FF/SparseHieroReorderingFeature.h b/moses/FF/SparseHieroReorderingFeature.h
new file mode 100644
index 000000000..d631fdec1
--- /dev/null
+++ b/moses/FF/SparseHieroReorderingFeature.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <string>
+
+#include <boost/unordered_set.hpp>
+
+#include <util/string_piece.hh>
+
+#include "moses/Factor.h"
+#include "moses/Sentence.h"
+
+#include "StatelessFeatureFunction.h"
+#include "FFState.h"
+
+namespace Moses
+{
+
+class SparseHieroReorderingFeature : public StatelessFeatureFunction
+{
+public:
+ enum Type {
+ SourceCombined,
+ SourceLeft,
+ SourceRight
+ };
+
+ SparseHieroReorderingFeature(const std::string &line);
+
+ bool IsUseable(const FactorMask &mask) const
+ { return true; }
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+ {}
+ virtual void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const
+ {}
+
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const;
+
+
+private:
+
+ typedef boost::unordered_set<const Factor*> Vocab;
+
+ void AddNonTerminalPairFeatures(
+ const Sentence& sentence, const WordsRange& nt1, const WordsRange& nt2,
+ bool isMonotone, ScoreComponentCollection* accumulator) const;
+
+ void LoadVocabulary(const std::string& filename, Vocab& vocab);
+ const Factor* GetFactor(const Word& word, const Vocab& vocab, FactorType factor) const;
+
+ Type m_type;
+ FactorType m_sourceFactor;
+ FactorType m_targetFactor;
+ std::string m_sourceVocabFile;
+ std::string m_targetVocabFile;
+
+ const Factor* m_otherFactor;
+
+ Vocab m_sourceVocab;
+ Vocab m_targetVocab;
+
+};
+
+
+}
+
diff --git a/moses-chart-cmd/mbr.h b/moses/FF/SparseHieroReorderingFeatureTest.cpp
index cd40a13b1..f05355df9 100644
--- a/moses-chart-cmd/mbr.h
+++ b/moses/FF/SparseHieroReorderingFeatureTest.cpp
@@ -1,8 +1,6 @@
-// $Id$
-
/***********************************************************************
Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
+Copyright (C) 2013- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -18,16 +16,21 @@ You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
+#include <iostream>
+
+#include <boost/test/unit_test.hpp>
+
+#include "SparseHieroReorderingFeature.h"
+
+using namespace Moses;
+using namespace std;
-#pragma once
+BOOST_AUTO_TEST_SUITE(shrf)
-#include <vector>
-#include <map>
-#include "moses/TrellisPathList.h"
-#include "moses/TrellisPath.h"
-#include "moses/Factor.h"
+BOOST_AUTO_TEST_CASE(lexical_rule)
+{
+ SparseHieroReorderingFeature feature("name=shrf");
-std::vector<const Moses::Factor*> doMBR(const Moses::TrellisPathList& nBestList);
-void GetOutputFactors(const Moses::TrellisPath &path, std::vector <const Moses::Factor*> &translation);
-float calculate_score(const std::vector< std::vector<const Moses::Factor*> > & sents, int ref, int hyp, std::vector < std::map < std::vector < const Moses::Factor *>, int > > & ngram_stats );
+}
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/moses/FF/StatefulFeatureFunction.h b/moses/FF/StatefulFeatureFunction.h
index 75b46d827..950b122e9 100644
--- a/moses/FF/StatefulFeatureFunction.h
+++ b/moses/FF/StatefulFeatureFunction.h
@@ -2,6 +2,8 @@
#include "FeatureFunction.h"
+#include "moses/Syntax/SHyperedge.h"
+
namespace Moses
{
class FFState;
@@ -29,16 +31,21 @@ public:
* hypothesis, you should store it in an FFState object which will be passed
* in as prev_state. If you don't do this, you will get in trouble.
*/
- virtual FFState* Evaluate(
+ virtual FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const = 0;
- virtual FFState* EvaluateChart(
+ virtual FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const = 0;
+ virtual FFState* EvaluateWhenApplied(
+ const Syntax::SHyperedge& /* cur_hypo */,
+ int /* featureID - used to index the state in the previous hypotheses */,
+ ScoreComponentCollection* accumulator) const { assert(false); return 0; /* FIXME */ }
+
//! return the state associated with the empty hypothesis for a given sentence
virtual const FFState* EmptyHypothesisState(const InputType &input) const = 0;
diff --git a/moses/FF/StatelessFeatureFunction.h b/moses/FF/StatelessFeatureFunction.h
index fde740115..9ef5d269a 100644
--- a/moses/FF/StatelessFeatureFunction.h
+++ b/moses/FF/StatelessFeatureFunction.h
@@ -2,6 +2,8 @@
#include "FeatureFunction.h"
+#include "moses/Syntax/SHyperedge.h"
+
namespace Moses
{
@@ -20,18 +22,22 @@ public:
StatelessFeatureFunction(const std::string &line);
StatelessFeatureFunction(size_t numScoreComponents, const std::string &line);
+
/**
* This should be implemented for features that apply to phrase-based models.
**/
- virtual void Evaluate(const Hypothesis& hypo,
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const = 0;
/**
* Same for chart-based features.
**/
- virtual void EvaluateChart(const ChartHypothesis &hypo,
+ virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const = 0;
+ virtual void EvaluateWhenApplied(const Syntax::SHyperedge &,
+ ScoreComponentCollection*) const { assert(false); }
+
virtual bool IsStateless() const {
return true;
}
diff --git a/moses/FF/SyntaxRHS.cpp b/moses/FF/SyntaxRHS.cpp
new file mode 100644
index 000000000..24b3bf062
--- /dev/null
+++ b/moses/FF/SyntaxRHS.cpp
@@ -0,0 +1,46 @@
+#include <vector>
+#include "SyntaxRHS.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TargetPhrase.h"
+#include "moses/StackVec.h"
+
+using namespace std;
+
+namespace Moses
+{
+SyntaxRHS::SyntaxRHS(const std::string &line)
+:StatelessFeatureFunction(1, line)
+{
+ ReadParameters();
+}
+
+void SyntaxRHS::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+{
+}
+
+void SyntaxRHS::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+ assert(stackVec);
+ for (size_t i = 0; i < stackVec->size(); ++i) {
+ const ChartCellLabel &cell = *stackVec->at(i);
+
+ }
+
+ if (targetPhrase.GetNumNonTerminals()) {
+ vector<float> newScores(m_numScoreComponents);
+ newScores[0] = - std::numeric_limits<float>::infinity();
+ scoreBreakdown.PlusEquals(this, newScores);
+ }
+
+}
+
+}
+
diff --git a/moses/FF/InternalStructStatelessFF.h b/moses/FF/SyntaxRHS.h
index 274982dce..4413aef72 100644
--- a/moses/FF/InternalStructStatelessFF.h
+++ b/moses/FF/SyntaxRHS.h
@@ -6,34 +6,33 @@
namespace Moses
{
-class InternalStructStatelessFF : public StatelessFeatureFunction
+class SyntaxRHS : public StatelessFeatureFunction
{
public:
- InternalStructStatelessFF(const std::string &line)
- :StatelessFeatureFunction(line)
- {}
+ SyntaxRHS(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
-
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
- virtual void Evaluate(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis &hypo,
+
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
+
};
}
diff --git a/moses/FF/TargetBigramFeature.cpp b/moses/FF/TargetBigramFeature.cpp
index 104f986e7..f1da62b7d 100644
--- a/moses/FF/TargetBigramFeature.cpp
+++ b/moses/FF/TargetBigramFeature.cpp
@@ -64,7 +64,7 @@ const FFState* TargetBigramFeature::EmptyHypothesisState(const InputType &/*inpu
return new TargetBigramState(m_bos);
}
-FFState* TargetBigramFeature::Evaluate(const Hypothesis& cur_hypo,
+FFState* TargetBigramFeature::EvaluateWhenApplied(const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
diff --git a/moses/FF/TargetBigramFeature.h b/moses/FF/TargetBigramFeature.h
index fe2500ad2..c63f3caa4 100644
--- a/moses/FF/TargetBigramFeature.h
+++ b/moses/FF/TargetBigramFeature.h
@@ -39,22 +39,22 @@ public:
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
+ virtual FFState* EvaluateWhenApplied(const Hypothesis& cur_hypo, const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */,
+ virtual FFState* EvaluateWhenApplied( const ChartHypothesis& /* cur_hypo */,
int /* featureID */,
ScoreComponentCollection* ) const {
throw std::logic_error("TargetBigramFeature not valid in chart decoder");
}
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/TargetNgramFeature.cpp b/moses/FF/TargetNgramFeature.cpp
index c269225f8..a810a742c 100644
--- a/moses/FF/TargetNgramFeature.cpp
+++ b/moses/FF/TargetNgramFeature.cpp
@@ -95,7 +95,7 @@ const FFState* TargetNgramFeature::EmptyHypothesisState(const InputType &/*input
return new TargetNgramState(bos);
}
-FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
+FFState* TargetNgramFeature::EvaluateWhenApplied(const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
@@ -207,7 +207,7 @@ void TargetNgramFeature::appendNgram(const Word& word, bool& skip, stringstream
}
}
-FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureId, ScoreComponentCollection* accumulator) const
+FFState* TargetNgramFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureId, ScoreComponentCollection* accumulator) const
{
vector<const Word*> contextFactor;
contextFactor.reserve(m_n);
diff --git a/moses/FF/TargetNgramFeature.h b/moses/FF/TargetNgramFeature.h
index 8e91a08b2..e87252670 100644
--- a/moses/FF/TargetNgramFeature.h
+++ b/moses/FF/TargetNgramFeature.h
@@ -186,20 +186,20 @@ public:
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
+ virtual FFState* EvaluateWhenApplied(const Hypothesis& cur_hypo, const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- virtual FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureId,
+ virtual FFState* EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureId,
ScoreComponentCollection* accumulator) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/TargetWordInsertionFeature.cpp b/moses/FF/TargetWordInsertionFeature.cpp
index 7bb1ae6e9..09a7b4472 100644
--- a/moses/FF/TargetWordInsertionFeature.cpp
+++ b/moses/FF/TargetWordInsertionFeature.cpp
@@ -53,7 +53,7 @@ void TargetWordInsertionFeature::Load()
m_unrestricted = false;
}
-void TargetWordInsertionFeature::Evaluate(const Phrase &source
+void TargetWordInsertionFeature::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
@@ -73,11 +73,7 @@ void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source,
if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
// flag aligned words
- bool aligned[16];
- UTIL_THROW_IF2(targetLength >= 16, "Target length must be less than 16 words");
- for(size_t i=0; i<targetLength; i++) {
- aligned[i] = false;
- }
+ std::vector<bool> aligned(targetLength, false);
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++) {
aligned[ alignmentPoint->second ] = true;
}
diff --git a/moses/FF/TargetWordInsertionFeature.h b/moses/FF/TargetWordInsertionFeature.h
index eedde61b2..06fa25400 100644
--- a/moses/FF/TargetWordInsertionFeature.h
+++ b/moses/FF/TargetWordInsertionFeature.h
@@ -28,21 +28,21 @@ public:
void Load();
- virtual void Evaluate(const Phrase &source
+ virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
diff --git a/moses/FF/TreeStructureFeature.cpp b/moses/FF/TreeStructureFeature.cpp
index 9e7dd6b12..38c21109d 100644
--- a/moses/FF/TreeStructureFeature.cpp
+++ b/moses/FF/TreeStructureFeature.cpp
@@ -1,250 +1,14 @@
#include "TreeStructureFeature.h"
#include "moses/StaticData.h"
#include "moses/ScoreComponentCollection.h"
-#include "moses/Hypothesis.h"
#include "moses/ChartHypothesis.h"
-#include "moses/TargetPhrase.h"
-#include <boost/shared_ptr.hpp>
#include <vector>
#include "moses/PP/TreeStructurePhraseProperty.h"
-using namespace std;
-
namespace Moses
{
-InternalTree::InternalTree(const std::string & line, const bool terminal):
- m_value_nt(0),
- m_isTerminal(terminal)
-{
-
- size_t found = line.find_first_of("[] ");
-
- if (found == line.npos) {
- m_value = line;
- }
-
- else {
- AddSubTree(line, 0);
- }
-}
-
-size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
-{
-
- std::string value = "";
- char token = 0;
-
- while (token != ']' && pos != std::string::npos) {
- size_t oldpos = pos;
- pos = line.find_first_of("[] ", pos);
- if (pos == std::string::npos) break;
- token = line[pos];
- value = line.substr(oldpos,pos-oldpos);
-
- if (token == '[') {
- if (m_value.size() > 0) {
- TreePointer child(new InternalTree(value, false));
- m_children.push_back(child);
- pos = child->AddSubTree(line, pos+1);
- } else {
- if (value.size() > 0) {
- m_value = value;
- }
- pos = AddSubTree(line, pos+1);
- }
- } else if (token == ' ' || token == ']') {
- if (value.size() > 0 && ! m_value.size() > 0) {
- m_value = value;
- } else if (value.size() > 0) {
- m_isTerminal = false;
- TreePointer child(new InternalTree(value, true));
- m_children.push_back(child);
- }
- if (token == ' ') {
- pos++;
- }
- }
-
- if (m_children.size() > 0) {
- m_isTerminal = false;
- }
- }
-
- if (pos == std::string::npos) {
- return line.size();
- }
- return min(line.size(),pos+1);
-
-}
-
-std::string InternalTree::GetString() const
-{
-
- std::string ret = " ";
-
- if (!m_isTerminal) {
- ret += "[";
- }
-
- ret += m_value;
- for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
- ret += (*it)->GetString();
- }
-
- if (!m_isTerminal) {
- ret += "]";
- }
- return ret;
-
-}
-
-
-void InternalTree::Combine(const std::vector<TreePointer> &previous)
-{
-
- std::vector<TreePointer>::iterator it;
- bool found = false;
- leafNT next_leafNT(this);
- for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
- found = next_leafNT(it);
- if (found) {
- *it = *it_prev;
- } else {
- std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
- }
- }
-}
-
-
-bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if ((*it)->GetLabel() == label) {
- return true;
- }
- }
- return false;
-}
-
-bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if ((*it)->GetLabel() == label) {
- return true;
- }
- std::vector<TreePointer>::const_iterator it2;
- if ((*it)->RecursiveSearch(label, it2)) {
- it = it2;
- return true;
- }
- }
- return false;
-}
-
-bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if ((*it)->GetLabel() == label) {
- parent = this;
- return true;
- }
- std::vector<TreePointer>::const_iterator it2;
- if ((*it)->RecursiveSearch(label, it2, parent)) {
- it = it2;
- return true;
- }
- }
- return false;
-}
-
-
-bool InternalTree::FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if ((*it)->GetNTLabel() == label) {
- return true;
- }
- }
- return false;
-}
-
-bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if ((*it)->GetNTLabel() == label) {
- return true;
- }
- std::vector<TreePointer>::const_iterator it2;
- if ((*it)->RecursiveSearch(label, it2)) {
- it = it2;
- return true;
- }
- }
- return false;
-}
-
-bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if ((*it)->GetNTLabel() == label) {
- parent = this;
- return true;
- }
- std::vector<TreePointer>::const_iterator it2;
- if ((*it)->RecursiveSearch(label, it2, parent)) {
- it = it2;
- return true;
- }
- }
- return false;
-}
-
-
-bool InternalTree::FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
- return true;
- }
- }
- return false;
-}
-
-bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
- return true;
- }
- std::vector<TreePointer>::const_iterator it2;
- if ((*it)->RecursiveSearch(labels, it2)) {
- it = it2;
- return true;
- }
- }
- return false;
-}
-
-bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
-{
- for (it = m_children.begin(); it != m_children.end(); ++it) {
- if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
- parent = this;
- return true;
- }
- std::vector<TreePointer>::const_iterator it2;
- if ((*it)->RecursiveSearch(labels, it2, parent)) {
- it = it2;
- return true;
- }
- }
- return false;
-}
-
-
-void TreeStructureFeature::Load()
-{
+void TreeStructureFeature::Load() {
// syntactic constraints can be hooked in here.
m_constraints = NULL;
@@ -256,36 +20,34 @@ void TreeStructureFeature::Load()
// define NT labels (ints) that are mapped from strings for quicker comparison.
-void TreeStructureFeature::AddNTLabels(TreePointer root) const
-{
- std::string label = root->GetLabel();
+void TreeStructureFeature::AddNTLabels(TreePointer root) const {
+ std::string label = root->GetLabel();
- if (root->IsTerminal()) {
- return;
- }
+ if (root->IsTerminal()) {
+ return;
+ }
- std::map<std::string, NTLabel>::const_iterator it = m_labelset->string_to_label.find(label);
- if (it != m_labelset->string_to_label.end()) {
- root->SetNTLabel(it->second);
- }
+ std::map<std::string, NTLabel>::const_iterator it = m_labelset->string_to_label.find(label);
+ if (it != m_labelset->string_to_label.end()) {
+ root->SetNTLabel(it->second);
+ }
- std::vector<TreePointer> children = root->GetChildren();
- for (std::vector<TreePointer>::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) {
- AddNTLabels(*it2);
- }
+ std::vector<TreePointer> children = root->GetChildren();
+ for (std::vector<TreePointer>::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) {
+ AddNTLabels(*it2);
+ }
}
-FFState* TreeStructureFeature::EvaluateChart(const ChartHypothesis& cur_hypo
- , int featureID /* used to index the state in the previous hypotheses */
- , ScoreComponentCollection* accumulator) const
+FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
+ , int featureID /* used to index the state in the previous hypotheses */
+ , ScoreComponentCollection* accumulator) const
{
- boost::shared_ptr<PhraseProperty> property;
- if (cur_hypo.GetCurrTargetPhrase().GetProperty("Tree", property)) {
- const std::string &tree = property->GetValueString();
- TreePointer mytree (new InternalTree(tree));
+ if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
+ const std::string *tree = property->GetValueString();
+ TreePointer mytree (boost::make_shared<InternalTree>(*tree));
if (m_labelset) {
- AddNTLabels(mytree);
+ AddNTLabels(mytree);
}
//get subtrees (in target order)
@@ -301,22 +63,17 @@ FFState* TreeStructureFeature::EvaluateChart(const ChartHypothesis& cur_hypo
}
}
- std::vector<std::string> sparse_features;
if (m_constraints) {
- sparse_features = m_constraints->SyntacticRules(mytree, previous_trees);
+ m_constraints->SyntacticRules(mytree, previous_trees, this, accumulator);
}
mytree->Combine(previous_trees);
- //sparse scores
- for (std::vector<std::string>::const_iterator feature=sparse_features.begin(); feature != sparse_features.end(); ++feature) {
- accumulator->PlusEquals(this, *feature, 1);
- }
return new TreeState(mytree);
- } else {
+ }
+ else {
UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
}
}
}
-
diff --git a/moses/FF/TreeStructureFeature.h b/moses/FF/TreeStructureFeature.h
index 76f06d4de..8dae3e001 100644
--- a/moses/FF/TreeStructureFeature.h
+++ b/moses/FF/TreeStructureFeature.h
@@ -4,134 +4,33 @@
#include <map>
#include "StatefulFeatureFunction.h"
#include "FFState.h"
-#include <boost/shared_ptr.hpp>
-#include "util/generator.hh"
-#include "util/exception.hh"
+#include "InternalTree.h"
namespace Moses
{
-class InternalTree;
-typedef boost::shared_ptr<InternalTree> TreePointer;
typedef int NTLabel;
-class InternalTree
-{
- std::string m_value;
- NTLabel m_value_nt;
- std::vector<TreePointer> m_children;
- bool m_isTerminal;
-public:
- InternalTree(const std::string & line, const bool terminal = false);
- InternalTree(const InternalTree & tree):
- m_value(tree.m_value),
- m_isTerminal(tree.m_isTerminal) {
- const std::vector<TreePointer> & children = tree.m_children;
- for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
- TreePointer child (new InternalTree(**it));
- m_children.push_back(child);
- }
- }
- size_t AddSubTree(const std::string & line, size_t start);
-
- std::string GetString() const;
- void Combine(const std::vector<TreePointer> &previous);
- const std::string & GetLabel() const {
- return m_value;
- }
-
- // optionally identify label by int instead of string;
- // allows abstraction if multiple nonterminal strings should map to same label.
- const NTLabel & GetNTLabel() const {
- return m_value_nt;
- }
-
- void SetNTLabel(NTLabel value) {
- m_value_nt = value;
- }
-
- size_t GetLength() const {
- return m_children.size();
- }
- std::vector<TreePointer> & GetChildren() {
- return m_children;
- }
- void AddChild(TreePointer child) {
- m_children.push_back(child);
- }
-
- bool IsTerminal() const {
- return m_isTerminal;
- }
-
- bool IsLeafNT() const {
- return (!m_isTerminal && m_children.size() == 0);
- }
-
- // different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents.
- // can be used for formulating syntax constraints.
-
- // if found, 'it' is iterator to first tree node that matches search string
- bool FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
- bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
-
- // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
- bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
-
- // use NTLabel for search to reduce number of string comparisons / deal with synonymous labels
- // if found, 'it' is iterator to first tree node that matches search string
- bool FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
- bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
-
- // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
- bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
-
- // pass vector of possible labels to search
- // if found, 'it' is iterator to first tree node that matches search string
- bool FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
- bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
-
- // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
- bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
-
-
-};
// mapping from string nonterminal label to int representation.
// allows abstraction if multiple nonterminal strings should map to same label.
-struct LabelSet {
+struct LabelSet
+{
public:
- std::map<std::string, NTLabel> string_to_label;
+ std::map<std::string, NTLabel> string_to_label;
};
// class to implement language-specific syntactic constraints.
-// the method SyntacticRules must return a vector of strings (each identifying a constraint violation), which are then made into sparse features.
+// the method SyntacticRules is given pointer to ScoreComponentCollection, so it can add sparse features itself.
class SyntaxConstraints
{
public:
- virtual std::vector<std::string> SyntacticRules(TreePointer root, const std::vector<TreePointer> &previous) = 0;
- virtual ~SyntaxConstraints() {};
+ virtual void SyntacticRules(TreePointer root, const std::vector<TreePointer> &previous, const FeatureFunction* sp, ScoreComponentCollection* accumulator) = 0;
+ virtual ~SyntaxConstraints() {};
};
-class TreeState : public FFState
-{
- TreePointer m_tree;
-public:
- TreeState(TreePointer tree)
- :m_tree(tree)
- {}
-
- TreePointer GetTree() const {
- return m_tree;
- }
-
- int Compare(const FFState& other) const {
- return 0;
- };
-};
-
class TreeStructureFeature : public StatefulFeatureFunction
{
SyntaxConstraints* m_constraints;
@@ -139,11 +38,9 @@ class TreeStructureFeature : public StatefulFeatureFunction
public:
TreeStructureFeature(const std::string &line)
:StatefulFeatureFunction(0, line) {
- ReadParameters();
- }
- ~TreeStructureFeature() {
- delete m_constraints;
- };
+ ReadParameters();
+ }
+ ~TreeStructureFeature() {delete m_constraints;};
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
return new TreeState(TreePointer());
@@ -155,23 +52,21 @@ public:
return true;
}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const {};
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const {};
- FFState* Evaluate(
+ FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
- ScoreComponentCollection* accumulator) const {
- UTIL_THROW(util::Exception, "Not implemented");
- };
- FFState* EvaluateChart(
+ ScoreComponentCollection* accumulator) const {UTIL_THROW(util::Exception, "Not implemented");};
+ FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
@@ -179,45 +74,5 @@ public:
void Load();
};
-// Python-like generator that yields next nonterminal leaf on every call
-$generator(leafNT)
-{
- std::vector<TreePointer>::iterator it;
- InternalTree* tree;
- leafNT(InternalTree* root = 0): tree(root) {}
- $emit(std::vector<TreePointer>::iterator)
- for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
- if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
- $yield(it);
- } else if ((*it)->GetLength() > 0) {
- if (&(**it)) { // normal pointer to same object that TreePointer points to
- $restart(tree = &(**it));
- }
- }
- }
- $stop;
-};
-
-
-// Python-like generator that yields the parent of the next nonterminal leaf on every call
-$generator(leafNTParent)
-{
- std::vector<TreePointer>::iterator it;
- InternalTree* tree;
- leafNTParent(InternalTree* root = 0): tree(root) {}
- $emit(InternalTree*)
- for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
- if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
- $yield(tree);
- } else if ((*it)->GetLength() > 0) {
- if (&(**it)) { // normal pointer to same object that TreePointer points to
- $restart(tree = &(**it));
- }
- }
- }
- $stop;
-};
-
}
-
diff --git a/moses/FF/UnknownWordPenaltyProducer.h b/moses/FF/UnknownWordPenaltyProducer.h
index 3b48f4380..1aa6cbbcf 100644
--- a/moses/FF/UnknownWordPenaltyProducer.h
+++ b/moses/FF/UnknownWordPenaltyProducer.h
@@ -31,20 +31,23 @@ public:
}
std::vector<float> DefaultWeights() const;
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
- void Evaluate(const InputType &input
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
+ ScoreComponentCollection* accumulator) const
+ {}
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/WordPenaltyProducer.cpp b/moses/FF/WordPenaltyProducer.cpp
index 6dea01b72..1e191d040 100644
--- a/moses/FF/WordPenaltyProducer.cpp
+++ b/moses/FF/WordPenaltyProducer.cpp
@@ -17,7 +17,7 @@ WordPenaltyProducer::WordPenaltyProducer(const std::string &line)
s_instance = this;
}
-void WordPenaltyProducer::Evaluate(const Phrase &source
+void WordPenaltyProducer::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/FF/WordPenaltyProducer.h b/moses/FF/WordPenaltyProducer.h
index a21904788..b17901c6b 100644
--- a/moses/FF/WordPenaltyProducer.h
+++ b/moses/FF/WordPenaltyProducer.h
@@ -27,17 +27,20 @@ public:
return true;
}
- virtual void Evaluate(const Phrase &source
+ virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const Hypothesis& hypo,
+ void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const
{}
- void EvaluateChart(const ChartHypothesis &hypo,
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const
{}
- void Evaluate(const InputType &input
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
+ ScoreComponentCollection* accumulator) const
+ {}
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
diff --git a/moses/FF/WordTranslationFeature.cpp b/moses/FF/WordTranslationFeature.cpp
index 554107c32..5111e677e 100644
--- a/moses/FF/WordTranslationFeature.cpp
+++ b/moses/FF/WordTranslationFeature.cpp
@@ -137,18 +137,19 @@ void WordTranslationFeature::Load()
}
}
-void WordTranslationFeature::Evaluate
-(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const
+void WordTranslationFeature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
{
- const Sentence& input = static_cast<const Sentence&>(hypo.GetInput());
- const TranslationOption& transOpt = hypo.GetTranslationOption();
- const TargetPhrase& targetPhrase = hypo.GetCurrTargetPhrase();
+ const Sentence& sentence = static_cast<const Sentence&>(input);
const AlignmentInfo &alignment = targetPhrase.GetAlignTerm();
// process aligned words
for (AlignmentInfo::const_iterator alignmentPoint = alignment.begin(); alignmentPoint != alignment.end(); alignmentPoint++) {
- const Phrase& sourcePhrase = transOpt.GetInputPath().GetPhrase();
+ const Phrase& sourcePhrase = inputPath.GetPhrase();
int sourceIndex = alignmentPoint->first;
int targetIndex = alignmentPoint->second;
Word ws = sourcePhrase.GetWord(sourceIndex);
@@ -183,15 +184,15 @@ void WordTranslationFeature::Evaluate
featureName << sourceWord;
featureName << "~";
featureName << targetWord;
- accumulator->SparsePlusEquals(featureName.str(), 1);
+ scoreBreakdown.SparsePlusEquals(featureName.str(), 1);
}
if (m_domainTrigger && !m_sourceContext) {
- const bool use_topicid = input.GetUseTopicId();
- const bool use_topicid_prob = input.GetUseTopicIdAndProb();
+ const bool use_topicid = sentence.GetUseTopicId();
+ const bool use_topicid_prob = sentence.GetUseTopicIdAndProb();
if (use_topicid || use_topicid_prob) {
if(use_topicid) {
// use topicid as trigger
- const long topicid = input.GetTopicId();
+ const long topicid = sentence.GetTopicId();
stringstream feature;
feature << m_description << "_";
if (topicid == -1)
@@ -203,7 +204,7 @@ void WordTranslationFeature::Evaluate
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
// use topic probabilities
const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
@@ -213,7 +214,7 @@ void WordTranslationFeature::Evaluate
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
@@ -223,7 +224,7 @@ void WordTranslationFeature::Evaluate
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
+ scoreBreakdown.SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
}
}
}
@@ -239,12 +240,12 @@ void WordTranslationFeature::Evaluate
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
}
}
}
if (m_sourceContext) {
- size_t globalSourceIndex = hypo.GetTranslationOption().GetStartPos() + sourceIndex;
+ size_t globalSourceIndex = inputPath.GetWordsRange().GetStartPos() + sourceIndex;
if (!m_domainTrigger && globalSourceIndex == 0) {
// add <s> trigger feature for source
stringstream feature;
@@ -253,7 +254,7 @@ void WordTranslationFeature::Evaluate
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
}
// range over source words to get context
@@ -284,7 +285,7 @@ void WordTranslationFeature::Evaluate
feature << sourceWord;
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
}
} else if (m_unrestricted || sourceTriggerExists) {
stringstream feature;
@@ -300,7 +301,7 @@ void WordTranslationFeature::Evaluate
}
feature << "~";
feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ scoreBreakdown.SparsePlusEquals(feature.str(), 1);
}
}
}
@@ -349,13 +350,6 @@ void WordTranslationFeature::Evaluate
}
}
-void WordTranslationFeature::EvaluateChart(
- const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const
-{
- UTIL_THROW(util::Exception, "Need source phrase. Can't be arsed at the moment");
-}
-
bool WordTranslationFeature::IsUseable(const FactorMask &mask) const
{
bool ret = mask[m_factorTypeTarget];
diff --git a/moses/FF/WordTranslationFeature.h b/moses/FF/WordTranslationFeature.h
index 072ba1d6a..63e3749c7 100644
--- a/moses/FF/WordTranslationFeature.h
+++ b/moses/FF/WordTranslationFeature.h
@@ -48,24 +48,27 @@ public:
return new DummyState();
}
- void Evaluate(const Hypothesis& hypo,
- ScoreComponentCollection* accumulator) const;
-
- void EvaluateChart(const ChartHypothesis &hypo,
- ScoreComponentCollection* accumulator) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection *estimatedFutureScore = NULL) const
- {}
- void Evaluate(const Phrase &source
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
+ void EvaluateWhenApplied(const Hypothesis& hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
};
}
diff --git a/moses/FactorCollection.cpp b/moses/FactorCollection.cpp
index 93edeff51..a29778310 100644
--- a/moses/FactorCollection.cpp
+++ b/moses/FactorCollection.cpp
@@ -66,6 +66,23 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString, bool
return &ret.first->in;
}
+const Factor *FactorCollection::GetFactor(const StringPiece &factorString, bool isNonTerminal)
+{
+ FactorFriend to_find;
+ to_find.in.m_string = factorString;
+ to_find.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
+ Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
+ { // read=lock scope
+#ifdef WITH_THREADS
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif // WITH_THREADS
+ Set::const_iterator i = set.find(to_find);
+ if (i != set.end()) return &i->in;
+ }
+ return NULL;
+}
+
+
FactorCollection::~FactorCollection() {}
TO_STRING_BODY(FactorCollection);
diff --git a/moses/FactorCollection.h b/moses/FactorCollection.h
index 400b6aa64..b5f49f3cf 100644
--- a/moses/FactorCollection.h
+++ b/moses/FactorCollection.h
@@ -114,6 +114,8 @@ public:
return m_factorIdNonTerminal;
}
+ const Factor *GetFactor(const StringPiece &factorString, bool isNonTerminal = false);
+
// TODO: remove calls to this function, replacing them with the simpler AddFactor(factorString)
const Factor *AddFactor(FactorDirection /*direction*/, FactorType /*factorType*/, const StringPiece &factorString, bool isNonTerminal = false) {
return AddFactor(factorString, isNonTerminal);
diff --git a/moses/FeatureVector.cpp b/moses/FeatureVector.cpp
index 536c1a720..ce4a043a6 100644
--- a/moses/FeatureVector.cpp
+++ b/moses/FeatureVector.cpp
@@ -214,10 +214,10 @@ void FVector::save(const string& filename) const
out.close();
}
-void FVector::write(ostream& out) const
+void FVector::write(ostream& out,const string& sep, const string& linesep) const
{
for (const_iterator i = cbegin(); i != cend(); ++i) {
- out << i->first << " " << i->second << endl;
+ out << i->first << sep << i->second << linesep;
}
}
diff --git a/moses/FeatureVector.h b/moses/FeatureVector.h
index 65f8bdc2e..fd5d3340c 100644
--- a/moses/FeatureVector.h
+++ b/moses/FeatureVector.h
@@ -167,7 +167,7 @@ public:
/** Load from file - each line should be 'root[_name] value' */
bool load(const std::string& filename);
void save(const std::string& filename) const;
- void write(std::ostream& out) const ;
+ void write(std::ostream& out, const std::string& sep=" ", const std::string& linesep="\n") const ;
/** Element access */
ProxyFVector operator[](const FName& name);
diff --git a/moses/HypergraphOutput.cpp b/moses/HypergraphOutput.cpp
new file mode 100644
index 000000000..481bca07e
--- /dev/null
+++ b/moses/HypergraphOutput.cpp
@@ -0,0 +1,252 @@
+// $Id$
+// vim:tabstop=2
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filter/bzip2.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+
+#include <util/exception.hh>
+
+#include "ChartHypothesisCollection.h"
+#include "ChartManager.h"
+#include "HypergraphOutput.h"
+#include "Manager.h"
+
+using namespace std;
+
+namespace Moses {
+
+template<class M>
+HypergraphOutput<M>::HypergraphOutput(size_t precision) :
+ m_precision(precision) {
+ const StaticData& staticData = StaticData::Instance();
+ vector<string> hypergraphParameters;
+ const PARAM_VEC *params = staticData.GetParameter().GetParam("output-search-graph-hypergraph");
+ if (params) {
+ hypergraphParameters = *params;
+ }
+
+ if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") {
+ m_appendSuffix = true;
+ } else {
+ m_appendSuffix = false;
+ }
+
+ string compression;
+ if (hypergraphParameters.size() > 1) {
+ m_compression = hypergraphParameters[1];
+ } else {
+ m_compression = "txt";
+ }
+ UTIL_THROW_IF(m_compression != "txt" && m_compression != "gz" && m_compression != "bz2",
+ util::Exception, "Unknown compression type: " << m_compression);
+
+ if ( hypergraphParameters.size() > 2 ) {
+ m_hypergraphDir = hypergraphParameters[2];
+ } else {
+ string nbestFile = staticData.GetNBestFilePath();
+ if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
+ boost::filesystem::path nbestPath(nbestFile);
+
+ // In the Boost filesystem API version 2,
+ // which was the default prior to Boost 1.46,
+ // the filename() method returned a string.
+ //
+ // In the Boost filesystem API version 3,
+ // which is the default starting with Boost 1.46,
+ // the filename() method returns a path object.
+ //
+ // To get a string from the path object,
+ // the native() method must be called.
+ // hypergraphDir = nbestPath.parent_path().filename()
+ //#if BOOST_VERSION >= 104600
+ // .native()
+ //#endif
+ //;
+
+ // Hopefully the following compiles under all versions of Boost.
+ //
+ // If this line gives you compile errors,
+ // contact Lane Schwartz on the Moses mailing list
+ m_hypergraphDir = nbestPath.parent_path().string();
+
+ } else {
+ stringstream hypergraphDirName;
+ hypergraphDirName << boost::filesystem::current_path().string() << "/hypergraph";
+ m_hypergraphDir = hypergraphDirName.str();
+ }
+ }
+
+ if ( ! boost::filesystem::exists(m_hypergraphDir) ) {
+ boost::filesystem::create_directory(m_hypergraphDir);
+ }
+
+ UTIL_THROW_IF(!boost::filesystem::is_directory(m_hypergraphDir),
+ util::Exception, "Cannot output hypergraphs to " << m_hypergraphDir << " because that path exists, but is not a directory");
+
+
+ ofstream weightsOut;
+ stringstream weightsFilename;
+ weightsFilename << m_hypergraphDir << "/weights";
+
+ TRACE_ERR("The weights file is " << weightsFilename.str() << "\n");
+ weightsOut.open(weightsFilename.str().c_str());
+ weightsOut.setf(std::ios::fixed);
+ weightsOut.precision(6);
+ staticData.GetAllWeights().Save(weightsOut);
+ weightsOut.close();
+}
+
+template<class M>
+void HypergraphOutput<M>::Write(const M& manager) const {
+
+ stringstream fileName;
+ fileName << m_hypergraphDir << "/" << manager.GetSource().GetTranslationId();
+ if ( m_appendSuffix ) {
+ fileName << "." << m_compression;
+ }
+ boost::iostreams::filtering_ostream file;
+
+ if ( m_compression == "gz" ) {
+ file.push( boost::iostreams::gzip_compressor() );
+ } else if ( m_compression == "bz2" ) {
+ file.push( boost::iostreams::bzip2_compressor() );
+ }
+
+ file.push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
+
+ if (file.is_complete() && file.good()) {
+ file.setf(std::ios::fixed);
+ file.precision(m_precision);
+ manager.OutputSearchGraphAsHypergraph(file);
+ file.flush();
+ } else {
+ TRACE_ERR("Cannot output hypergraph for line " << manager.GetSource().GetTranslationId()
+ << " because the output file " << fileName.str()
+ << " is not open or not ready for writing"
+ << std::endl);
+ }
+ file.pop();
+}
+
+template class HypergraphOutput<Manager>;
+template class HypergraphOutput<ChartManager>;
+
+
+void ChartSearchGraphWriterMoses::WriteHypos
+ (const ChartHypothesisCollection& hypos, const map<unsigned, bool> &reachable) const {
+
+ ChartHypothesisCollection::const_iterator iter;
+ for (iter = hypos.begin() ; iter != hypos.end() ; ++iter) {
+ ChartHypothesis &mainHypo = **iter;
+ if (StaticData::Instance().GetUnprunedSearchGraph() ||
+ reachable.find(mainHypo.GetId()) != reachable.end()) {
+ (*m_out) << m_lineNumber << " " << mainHypo << endl;
+ }
+
+ const ChartArcList *arcList = mainHypo.GetArcList();
+ if (arcList) {
+ ChartArcList::const_iterator iterArc;
+ for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
+ const ChartHypothesis &arc = **iterArc;
+ if (reachable.find(arc.GetId()) != reachable.end()) {
+ (*m_out) << m_lineNumber << " " << arc << endl;
+ }
+ }
+ }
+ }
+
+}
+void ChartSearchGraphWriterHypergraph::WriteHeader(size_t winners, size_t losers) const {
+
+ (*m_out) << "# target ||| features ||| source-covered" << endl;
+ (*m_out) << winners << " " << (winners+losers) << endl;
+
+}
+
+void ChartSearchGraphWriterHypergraph::WriteHypos(const ChartHypothesisCollection& hypos,
+ const map<unsigned, bool> &reachable) const {
+
+ ChartHypothesisCollection::const_iterator iter;
+ for (iter = hypos.begin() ; iter != hypos.end() ; ++iter) {
+ const ChartHypothesis* mainHypo = *iter;
+ if (!StaticData::Instance().GetUnprunedSearchGraph() &&
+ reachable.find(mainHypo->GetId()) == reachable.end()) {
+ //Ignore non reachable nodes
+ continue;
+ }
+ (*m_out) << "# node " << m_nodeId << endl;
+ m_hypoIdToNodeId[mainHypo->GetId()] = m_nodeId;
+ ++m_nodeId;
+ vector<const ChartHypothesis*> edges;
+ edges.push_back(mainHypo);
+ const ChartArcList *arcList = (*iter)->GetArcList();
+ if (arcList) {
+ ChartArcList::const_iterator iterArc;
+ for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
+ const ChartHypothesis* arc = *iterArc;
+ if (reachable.find(arc->GetId()) != reachable.end()) {
+ edges.push_back(arc);
+ }
+ }
+ }
+ (*m_out) << edges.size() << endl;
+ for (vector<const ChartHypothesis*>::const_iterator ei = edges.begin(); ei != edges.end(); ++ei) {
+ const ChartHypothesis* hypo = *ei;
+ const TargetPhrase& target = hypo->GetCurrTargetPhrase();
+ size_t ntIndex = 0;
+ for (size_t i = 0; i < target.GetSize(); ++i) {
+ const Word& word = target.GetWord(i);
+ if (word.IsNonTerminal()) {
+ size_t hypoId = hypo->GetPrevHypos()[ntIndex++]->GetId();
+ (*m_out) << "[" << m_hypoIdToNodeId[hypoId] << "]";
+ } else {
+ (*m_out) << word.GetFactor(0)->GetString();
+ }
+ (*m_out) << " ";
+ }
+ (*m_out) << " ||| ";
+ ScoreComponentCollection scores = hypo->GetScoreBreakdown();
+ HypoList::const_iterator hi;
+ for (hi = hypo->GetPrevHypos().begin(); hi != hypo->GetPrevHypos().end(); ++hi) {
+ scores.MinusEquals((*hi)->GetScoreBreakdown());
+ }
+ scores.Save(*m_out, false);
+ (*m_out) << " ||| ";
+ (*m_out) << hypo->GetCurrSourceRange().GetNumWordsCovered();
+ (*m_out) << endl;
+
+ }
+ }
+}
+
+
+} //namespace Moses
+
diff --git a/moses/HypergraphOutput.h b/moses/HypergraphOutput.h
new file mode 100644
index 000000000..4ec8e2665
--- /dev/null
+++ b/moses/HypergraphOutput.h
@@ -0,0 +1,95 @@
+// $Id$
+// vim:tabstop=2
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2014- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#ifndef moses_Hypergraph_Output_h
+#define moses_Hypergraph_Output_h
+
+#include <ostream>
+
+/**
+* Manage the output of hypergraphs.
+**/
+
+namespace Moses {
+
+class ChartHypothesisCollection;
+
+template<class M>
+class HypergraphOutput {
+
+public:
+ /** Initialise output directory and create weights file */
+ HypergraphOutput(size_t precision);
+
+ /** Write this hypergraph to file */
+ void Write(const M& manager) const;
+
+private:
+ size_t m_precision;
+ std::string m_hypergraphDir;
+ std::string m_compression;
+ bool m_appendSuffix;
+};
+
+
+/**
+ * ABC for different types of search graph output for chart Moses.
+**/
+class ChartSearchGraphWriter {
+public:
+ virtual void WriteHeader(size_t winners, size_t losers) const = 0;
+ virtual void WriteHypos(const ChartHypothesisCollection& hypos,
+ const std::map<unsigned, bool> &reachable) const = 0;
+
+};
+
+/** "Moses" format (osg style) */
+class ChartSearchGraphWriterMoses : public virtual ChartSearchGraphWriter {
+public:
+ ChartSearchGraphWriterMoses(std::ostream* out, size_t lineNumber) :
+ m_out(out), m_lineNumber(lineNumber) {}
+ virtual void WriteHeader(size_t, size_t) const {/* do nothing */}
+ virtual void WriteHypos(const ChartHypothesisCollection& hypos,
+ const std::map<unsigned, bool> &reachable) const;
+
+private:
+ std::ostream* m_out;
+ size_t m_lineNumber;
+};
+
+/** Modified version of Kenneth's lazy hypergraph format */
+class ChartSearchGraphWriterHypergraph : public virtual ChartSearchGraphWriter {
+public:
+ ChartSearchGraphWriterHypergraph(std::ostream* out) :
+ m_out(out), m_nodeId(0) {}
+ virtual void WriteHeader(size_t winners, size_t losers) const;
+ virtual void WriteHypos(const ChartHypothesisCollection& hypos,
+ const std::map<unsigned, bool> &reachable) const;
+
+private:
+ std::ostream* m_out;
+ mutable size_t m_nodeId;
+ mutable std::map<size_t,size_t> m_hypoIdToNodeId;
+};
+
+}
+#endif
diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp
index 12adb52a4..7f6530e00 100644
--- a/moses/Hypothesis.cpp
+++ b/moses/Hypothesis.cpp
@@ -47,7 +47,7 @@ ObjectPool<Hypothesis> Hypothesis::s_objectPool("Hypothesis", 300000);
Hypothesis::Hypothesis(Manager& manager, InputType const& source, const TranslationOption &initialTransOpt)
: m_prevHypo(NULL)
- , m_sourceCompleted(source.GetSize(), manager.m_source.m_sourceCompleted)
+ , m_sourceCompleted(source.GetSize(), manager.GetSource().m_sourceCompleted)
, m_sourceInput(source)
, m_currSourceWordsRange(
m_sourceCompleted.GetFirstGapPos()>0 ? 0 : NOT_FOUND,
@@ -85,14 +85,13 @@ Hypothesis::Hypothesis(const Hypothesis &prevHypo, const TranslationOption &tran
, m_wordDeleted(false)
, m_totalScore(0.0f)
, m_futureScore(0.0f)
- , m_scoreBreakdown(prevHypo.GetScoreBreakdown())
, m_ffStates(prevHypo.m_ffStates.size())
, m_arcList(NULL)
, m_transOpt(transOpt)
, m_manager(prevHypo.GetManager())
, m_id(m_manager.GetNextHypoId())
{
- m_scoreBreakdown.PlusEquals(transOpt.GetScoreBreakdown());
+ m_currScoreBreakdown.PlusEquals(transOpt.GetScoreBreakdown());
// assert that we are not extending our hypothesis by retranslating something
// that this hypothesis has already translated!
@@ -206,34 +205,30 @@ int Hypothesis::RecombineCompare(const Hypothesis &compare) const
return 0;
}
-void Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff,
+void Hypothesis::EvaluateWhenApplied(const StatefulFeatureFunction &sfff,
int state_idx)
{
- cerr << "Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff, int state_idx) START" << endl;
- cerr << "sfff Description:|" << sfff.GetScoreProducerDescription() << "|" << endl;
const StaticData &staticData = StaticData::Instance();
if (! staticData.IsFeatureFunctionIgnored( sfff )) {
- cerr << "feature is not ignored" << endl;
- m_ffStates[state_idx] = sfff.Evaluate(
+ m_ffStates[state_idx] = sfff.EvaluateWhenApplied(
*this,
m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
- &m_scoreBreakdown);
+ &m_currScoreBreakdown);
}
- cerr << "Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff, int state_idx) END" << endl;
}
-void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff)
+void Hypothesis::EvaluateWhenApplied(const StatelessFeatureFunction& slff)
{
const StaticData &staticData = StaticData::Instance();
if (! staticData.IsFeatureFunctionIgnored( slff )) {
- slff.Evaluate(*this, &m_scoreBreakdown);
+ slff.EvaluateWhenApplied(*this, &m_currScoreBreakdown);
}
}
/***
* calculate the logarithm of our total translation score (sum up components)
*/
-void Hypothesis::Evaluate(const SquareMatrix &futureScore)
+void Hypothesis::EvaluateWhenApplied(const SquareMatrix &futureScore)
{
IFVERBOSE(2) {
m_manager.GetSentenceStats().StartTimeOtherScore();
@@ -249,7 +244,7 @@ void Hypothesis::Evaluate(const SquareMatrix &futureScore)
StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (unsigned i = 0; i < sfs.size(); ++i) {
const StatelessFeatureFunction &ff = *sfs[i];
- EvaluateWith(ff);
+ EvaluateWhenApplied(ff);
}
const vector<const StatefulFeatureFunction*>& ffs =
@@ -258,9 +253,9 @@ void Hypothesis::Evaluate(const SquareMatrix &futureScore)
const StatefulFeatureFunction &ff = *ffs[i];
const StaticData &staticData = StaticData::Instance();
if (! staticData.IsFeatureFunctionIgnored(ff)) {
- m_ffStates[i] = ff.Evaluate(*this,
+ m_ffStates[i] = ff.EvaluateWhenApplied(*this,
m_prevHypo ? m_prevHypo->m_ffStates[i] : NULL,
- &m_scoreBreakdown);
+ &m_currScoreBreakdown);
}
}
@@ -273,7 +268,8 @@ void Hypothesis::Evaluate(const SquareMatrix &futureScore)
m_futureScore = futureScore.CalcFutureScore( m_sourceCompleted );
// TOTAL
- m_totalScore = m_scoreBreakdown.GetWeightedScore() + m_futureScore;
+ m_totalScore = m_currScoreBreakdown.GetWeightedScore() + m_futureScore;
+ if (m_prevHypo) m_totalScore += m_prevHypo->GetScore();
IFVERBOSE(2) {
m_manager.GetSentenceStats().StopTimeEstimateScore();
@@ -319,7 +315,7 @@ void Hypothesis::PrintHypothesis() const
// TRACE_ERR( "\tlanguage model cost "); // <<m_score[ScoreType::LanguageModelScore]<<endl;
// TRACE_ERR( "\tword penalty "); // <<(m_score[ScoreType::WordPenalty]*weightWordPenalty)<<endl;
TRACE_ERR( "\tscore "<<m_totalScore - m_futureScore<<" + future cost "<<m_futureScore<<" = "<<m_totalScore<<endl);
- TRACE_ERR( "\tunweighted feature scores: " << m_scoreBreakdown << endl);
+ TRACE_ERR( "\tunweighted feature scores: " << m_currScoreBreakdown << endl);
//PrintLMScores();
}
@@ -336,14 +332,14 @@ void Hypothesis::CleanupArcList()
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
- bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
+ bool distinctNBest = staticData.GetDistinctNBest() || staticData.GetLatticeSamplesSize() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
// prune arc list only if there too many arcs
- NTH_ELEMENT4(m_arcList->begin()
- , m_arcList->begin() + nBestSize - 1
- , m_arcList->end()
- , CompareHypothesisTotalScore());
+ NTH_ELEMENT4(m_arcList->begin()
+ , m_arcList->begin() + nBestSize - 1
+ , m_arcList->end()
+ , CompareHypothesisTotalScore());
// delete bad ones
ArcList::iterator iter;
diff --git a/moses/Hypothesis.h b/moses/Hypothesis.h
index 2c49a8ea2..2b0c98d91 100644
--- a/moses/Hypothesis.h
+++ b/moses/Hypothesis.h
@@ -25,6 +25,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <iostream>
#include <memory>
+
+#include <boost/scoped_ptr.hpp>
+
#include <vector>
#include "Phrase.h"
#include "TypeDef.h"
@@ -77,7 +80,9 @@ protected:
bool m_wordDeleted;
float m_totalScore; /*! score so far */
float m_futureScore; /*! estimated future cost to translate rest of sentence */
- ScoreComponentCollection m_scoreBreakdown; /*! scores for this hypothesis */
+ /*! sum of scores of this hypothesis, and previous hypotheses. Lazily initialised. */
+ mutable boost::scoped_ptr<ScoreComponentCollection> m_scoreBreakdown;
+ ScoreComponentCollection m_currScoreBreakdown; /*! scores for this hypothesis only */
std::vector<const FFState*> m_ffStates;
const Hypothesis *m_winningHypo;
ArcList *m_arcList; /*! all arcs that end at the same trellis point as this hypothesis */
@@ -137,7 +142,7 @@ public:
return m_currTargetWordsRange.GetNumWordsCovered();
}
- void Evaluate(const SquareMatrix &futureScore);
+ void EvaluateWhenApplied(const SquareMatrix &futureScore);
int GetId()const {
return m_id;
@@ -228,7 +233,14 @@ public:
return m_arcList;
}
const ScoreComponentCollection& GetScoreBreakdown() const {
- return m_scoreBreakdown;
+ if (!m_scoreBreakdown.get()) {
+ m_scoreBreakdown.reset(new ScoreComponentCollection());
+ m_scoreBreakdown->PlusEquals(m_currScoreBreakdown);
+ if (m_prevHypo) {
+ m_scoreBreakdown->PlusEquals(m_prevHypo->GetScoreBreakdown());
+ }
+ }
+ return *(m_scoreBreakdown.get());
}
float GetTotalScore() const {
return m_totalScore;
@@ -244,8 +256,8 @@ public:
}
// Added by oliver.wilson@ed.ac.uk for async lm stuff.
- void EvaluateWith(const StatefulFeatureFunction &sfff, int state_idx);
- void EvaluateWith(const StatelessFeatureFunction &slff);
+ void EvaluateWhenApplied(const StatefulFeatureFunction &sfff, int state_idx);
+ void EvaluateWhenApplied(const StatelessFeatureFunction &slff);
//! target span that trans opt would populate if applied to this hypo. Used for alignment check
size_t GetNextStartPos(const TranslationOption &transOpt) const;
diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp
new file mode 100644
index 000000000..ee7b12db5
--- /dev/null
+++ b/moses/IOWrapper.cpp
@@ -0,0 +1,909 @@
+// $Id$
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the University of Edinburgh nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+ ***********************************************************************/
+
+// example file on how to use moses library
+
+#include <iostream>
+#include <stack>
+#include <boost/algorithm/string.hpp>
+
+#include "moses/Syntax/KBestExtractor.h"
+#include "moses/Syntax/SHyperedge.h"
+#include "moses/Syntax/S2T/DerivationWriter.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SVertex.h"
+
+#include "moses/TypeDef.h"
+#include "moses/Util.h"
+#include "moses/Hypothesis.h"
+#include "moses/WordsRange.h"
+#include "moses/TrellisPathList.h"
+#include "moses/StaticData.h"
+#include "moses/FeatureVector.h"
+#include "moses/InputFileStream.h"
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/TreeInput.h"
+#include "moses/ConfusionNet.h"
+#include "moses/WordLattice.h"
+#include "moses/Incremental.h"
+#include "moses/ChartManager.h"
+
+
+#include "util/exception.hh"
+
+#include "IOWrapper.h"
+
+using namespace std;
+
+namespace Moses
+{
+
+IOWrapper::IOWrapper()
+ :m_nBestStream(NULL)
+
+ ,m_outputWordGraphStream(NULL)
+ ,m_outputSearchGraphStream(NULL)
+ ,m_detailedTranslationReportingStream(NULL)
+ ,m_unknownsStream(NULL)
+ ,m_alignmentInfoStream(NULL)
+ ,m_latticeSamplesStream(NULL)
+
+ ,m_singleBestOutputCollector(NULL)
+ ,m_nBestOutputCollector(NULL)
+ ,m_unknownsCollector(NULL)
+ ,m_alignmentInfoCollector(NULL)
+ ,m_searchGraphOutputCollector(NULL)
+ ,m_detailedTranslationCollector(NULL)
+ ,m_wordGraphCollector(NULL)
+ ,m_latticeSamplesCollector(NULL)
+ ,m_detailTreeFragmentsOutputCollector(NULL)
+
+ ,m_surpressSingleBestOutput(false)
+
+ ,spe_src(NULL)
+ ,spe_trg(NULL)
+ ,spe_aln(NULL)
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ m_inputFactorOrder = &staticData.GetInputFactorOrder();
+ m_outputFactorOrder = &staticData.GetOutputFactorOrder();
+ m_inputFactorUsed = FactorMask(*m_inputFactorOrder);
+
+ size_t nBestSize = staticData.GetNBestSize();
+ string nBestFilePath = staticData.GetNBestFilePath();
+
+ staticData.GetParameter().SetParameter<string>(m_inputFilePath, "input-file", "");
+ if (m_inputFilePath.empty()) {
+ m_inputFile = NULL;
+ m_inputStream = &cin;
+ }
+ else {
+ VERBOSE(2,"IO from File" << endl);
+ m_inputFile = new InputFileStream(m_inputFilePath);
+ m_inputStream = m_inputFile;
+ }
+
+ if (nBestSize > 0) {
+ if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") {
+ m_nBestStream = &std::cout;
+ m_nBestOutputCollector = new Moses::OutputCollector(&std::cout);
+ m_surpressSingleBestOutput = true;
+ } else {
+ std::ofstream *file = new std::ofstream;
+ file->open(nBestFilePath.c_str());
+ m_nBestStream = file;
+
+ m_nBestOutputCollector = new Moses::OutputCollector(file);
+ //m_nBestOutputCollector->HoldOutputStream();
+ }
+ }
+
+ // search graph output
+ if (staticData.GetOutputSearchGraph()) {
+ string fileName;
+ if (staticData.GetOutputSearchGraphExtended()) {
+ staticData.GetParameter().SetParameter<string>(fileName, "output-search-graph-extended", "");
+ }
+ else {
+ staticData.GetParameter().SetParameter<string>(fileName, "output-search-graph", "");
+ }
+ std::ofstream *file = new std::ofstream;
+ m_outputSearchGraphStream = file;
+ file->open(fileName.c_str());
+ }
+
+ if (!staticData.GetOutputUnknownsFile().empty()) {
+ m_unknownsStream = new std::ofstream(staticData.GetOutputUnknownsFile().c_str());
+ m_unknownsCollector = new Moses::OutputCollector(m_unknownsStream);
+ UTIL_THROW_IF2(!m_unknownsStream->good(),
+ "File for unknowns words could not be opened: " <<
+ staticData.GetOutputUnknownsFile());
+ }
+
+ if (!staticData.GetAlignmentOutputFile().empty()) {
+ m_alignmentInfoStream = new std::ofstream(staticData.GetAlignmentOutputFile().c_str());
+ m_alignmentInfoCollector = new Moses::OutputCollector(m_alignmentInfoStream);
+ UTIL_THROW_IF2(!m_alignmentInfoStream->good(),
+ "File for alignment output could not be opened: " << staticData.GetAlignmentOutputFile());
+ }
+
+ if (staticData.GetOutputSearchGraph()) {
+ string fileName;
+ staticData.GetParameter().SetParameter<string>(fileName, "output-search-graph", "");
+
+ std::ofstream *file = new std::ofstream;
+ m_outputSearchGraphStream = file;
+ file->open(fileName.c_str());
+ m_searchGraphOutputCollector = new Moses::OutputCollector(m_outputSearchGraphStream);
+ }
+
+ // detailed translation reporting
+ if (staticData.IsDetailedTranslationReportingEnabled()) {
+ const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
+ m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
+ m_detailedTranslationCollector = new Moses::OutputCollector(m_detailedTranslationReportingStream);
+ }
+
+ if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
+ const std::string &path = staticData.GetDetailedTreeFragmentsTranslationReportingFilePath();
+ m_detailedTreeFragmentsTranslationReportingStream = new std::ofstream(path.c_str());
+ m_detailTreeFragmentsOutputCollector = new Moses::OutputCollector(m_detailedTreeFragmentsTranslationReportingStream);
+ }
+
+ // wordgraph output
+ if (staticData.GetOutputWordGraph()) {
+ string fileName;
+ staticData.GetParameter().SetParameter<string>(fileName, "output-word-graph", "");
+
+ std::ofstream *file = new std::ofstream;
+ m_outputWordGraphStream = file;
+ file->open(fileName.c_str());
+ m_wordGraphCollector = new OutputCollector(m_outputWordGraphStream);
+ }
+
+ size_t latticeSamplesSize = staticData.GetLatticeSamplesSize();
+ string latticeSamplesFile = staticData.GetLatticeSamplesFilePath();
+ if (latticeSamplesSize) {
+ if (latticeSamplesFile == "-" || latticeSamplesFile == "/dev/stdout") {
+ m_latticeSamplesCollector = new OutputCollector();
+ m_surpressSingleBestOutput = true;
+ } else {
+ m_latticeSamplesStream = new ofstream(latticeSamplesFile.c_str());
+ if (!m_latticeSamplesStream->good()) {
+ TRACE_ERR("ERROR: Failed to open " << latticeSamplesFile << " for lattice samples" << endl);
+ exit(1);
+ }
+ m_latticeSamplesCollector = new OutputCollector(m_latticeSamplesStream);
+ }
+ }
+
+ if (!m_surpressSingleBestOutput) {
+ m_singleBestOutputCollector = new Moses::OutputCollector(&std::cout);
+ }
+
+ if (staticData.GetParameter().GetParam("spe-src")) {
+ spe_src = new ifstream(staticData.GetParameter().GetParam("spe-src")->at(0).c_str());
+ spe_trg = new ifstream(staticData.GetParameter().GetParam("spe-trg")->at(0).c_str());
+ spe_aln = new ifstream(staticData.GetParameter().GetParam("spe-aln")->at(0).c_str());
+ }
+}
+
+IOWrapper::~IOWrapper()
+{
+ if (m_inputFile != NULL)
+ delete m_inputFile;
+ if (m_nBestStream != NULL && !m_surpressSingleBestOutput) {
+ // outputting n-best to file, rather than stdout. need to close file and delete obj
+ delete m_nBestStream;
+ }
+
+ delete m_detailedTranslationReportingStream;
+ delete m_alignmentInfoStream;
+ delete m_unknownsStream;
+ delete m_outputSearchGraphStream;
+ delete m_outputWordGraphStream;
+ delete m_latticeSamplesStream;
+
+ delete m_singleBestOutputCollector;
+ delete m_nBestOutputCollector;
+ delete m_alignmentInfoCollector;
+ delete m_searchGraphOutputCollector;
+ delete m_detailedTranslationCollector;
+ delete m_wordGraphCollector;
+ delete m_latticeSamplesCollector;
+ delete m_detailTreeFragmentsOutputCollector;
+
+}
+
+InputType*
+IOWrapper::
+GetInput(InputType* inputType)
+{
+ if(inputType->Read(*m_inputStream, *m_inputFactorOrder)) {
+ return inputType;
+ } else {
+ delete inputType;
+ return NULL;
+ }
+}
+
+std::map<size_t, const Factor*> IOWrapper::GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor)
+{
+ const InputPath &inputPath = hypo.GetTranslationOption().GetInputPath();
+ const Phrase &inputPhrase = inputPath.GetPhrase();
+
+ std::map<size_t, const Factor*> ret;
+
+ for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) {
+ const Factor *factor = inputPhrase.GetFactor(sourcePos, placeholderFactor);
+ if (factor) {
+ std::set<size_t> targetPos = hypo.GetTranslationOption().GetTargetPhrase().GetAlignTerm().GetAlignmentsForSource(sourcePos);
+ UTIL_THROW_IF2(targetPos.size() != 1,
+ "Placeholder should be aligned to 1, and only 1, word");
+ ret[*targetPos.begin()] = factor;
+ }
+ }
+
+ return ret;
+}
+
+
+void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)
+{
+ if (!m_singleBestOutputCollector)
+ return;
+ std::ostringstream out;
+ FixPrecision(out);
+ if (hypo != NULL) {
+ VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
+ VERBOSE(3,"Best path: ");
+ Backtrack(hypo);
+ VERBOSE(3,"0" << std::endl);
+
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << hypo->GetTotalScore() << " ";
+ }
+
+ if (StaticData::Instance().IsPathRecoveryEnabled()) {
+ out << "||| ";
+ }
+ Phrase outPhrase(ARRAY_SIZE_INCR);
+ hypo->GetOutputPhrase(outPhrase);
+
+ // delete 1st & last
+ UTIL_THROW_IF2(outPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+
+ outPhrase.RemoveWord(0);
+ outPhrase.RemoveWord(outPhrase.GetSize() - 1);
+
+ const std::vector<FactorType> outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+ string output = outPhrase.GetStringRep(outputFactorOrder);
+ out << output << endl;
+ } else {
+ VERBOSE(1, "NO BEST TRANSLATION" << endl);
+
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << "0 ";
+ }
+
+ out << endl;
+ }
+ m_singleBestOutputCollector->Write(translationId, out.str());
+}
+
+void IOWrapper::OutputBestHypo(search::Applied applied, long translationId)
+{
+ if (!m_singleBestOutputCollector) return;
+ std::ostringstream out;
+ FixPrecision(out);
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << applied.GetScore() << ' ';
+ }
+ Phrase outPhrase;
+ Incremental::ToPhrase(applied, outPhrase);
+ // delete 1st & last
+ UTIL_THROW_IF2(outPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+ outPhrase.RemoveWord(0);
+ outPhrase.RemoveWord(outPhrase.GetSize() - 1);
+ out << outPhrase.GetStringRep(StaticData::Instance().GetOutputFactorOrder());
+ out << '\n';
+ m_singleBestOutputCollector->Write(translationId, out.str());
+
+ VERBOSE(1,"BEST TRANSLATION: " << outPhrase << "[total=" << applied.GetScore() << "]" << endl);
+}
+
+void IOWrapper::OutputBestNone(long translationId)
+{
+ if (!m_singleBestOutputCollector) return;
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ m_singleBestOutputCollector->Write(translationId, "0 \n");
+ } else {
+ m_singleBestOutputCollector->Write(translationId, "\n");
+ }
+}
+
+void IOWrapper::Backtrack(const ChartHypothesis *hypo)
+{
+ const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+
+ vector<const ChartHypothesis*>::const_iterator iter;
+ for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
+ const ChartHypothesis *prevHypo = *iter;
+
+ VERBOSE(3,prevHypo->GetId() << " <= ");
+ Backtrack(prevHypo);
+ }
+}
+
+
+void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
+{
+ if (hypo != NULL) {
+ OutputTranslationOption(out, applicationContext, hypo, sentence, translationId);
+ out << std::endl;
+ }
+
+ // recursive
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+ std::vector<const ChartHypothesis*>::const_iterator iter;
+ for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
+ const ChartHypothesis *prevHypo = *iter;
+ OutputTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
+ }
+}
+
+
+void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
+{
+ if (applied != NULL) {
+ OutputTranslationOption(out, applicationContext, applied, sentence, translationId);
+ out << std::endl;
+ }
+
+ // recursive
+ const search::Applied *child = applied->Children();
+ for (size_t i = 0; i < applied->GetArity(); i++) {
+ OutputTranslationOptions(out, applicationContext, child++, sentence, translationId);
+ }
+}
+
+void IOWrapper::OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
+{
+ ReconstructApplicationContext(*hypo, sentence, applicationContext);
+ out << "Trans Opt " << translationId
+ << " " << hypo->GetCurrSourceRange()
+ << ": ";
+ WriteApplicationContext(out, applicationContext);
+ out << ": " << hypo->GetCurrTargetPhrase().GetTargetLHS()
+ << "->" << hypo->GetCurrTargetPhrase()
+ << " " << hypo->GetTotalScore() << hypo->GetScoreBreakdown();
+}
+
+void IOWrapper::OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Sentence &sentence, long translationId)
+{
+ ReconstructApplicationContext(applied, sentence, applicationContext);
+ const TargetPhrase &phrase = *static_cast<const TargetPhrase*>(applied->GetNote().vp);
+ out << "Trans Opt " << translationId
+ << " " << applied->GetRange()
+ << ": ";
+ WriteApplicationContext(out, applicationContext);
+ out << ": " << phrase.GetTargetLHS()
+ << "->" << phrase
+ << " " << applied->GetScore(); // << hypo->GetScoreBreakdown() TODO: missing in incremental search hypothesis
+}
+
+// Given a hypothesis and sentence, reconstructs the 'application context' --
+// the source RHS symbols of the SCFG rule that was applied, plus their spans.
+void IOWrapper::ReconstructApplicationContext(const ChartHypothesis &hypo,
+ const Sentence &sentence,
+ ApplicationContext &context)
+{
+ context.clear();
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo.GetPrevHypos();
+ std::vector<const ChartHypothesis*>::const_iterator p = prevHypos.begin();
+ std::vector<const ChartHypothesis*>::const_iterator end = prevHypos.end();
+ const WordsRange &span = hypo.GetCurrSourceRange();
+ size_t i = span.GetStartPos();
+ while (i <= span.GetEndPos()) {
+ if (p == end || i < (*p)->GetCurrSourceRange().GetStartPos()) {
+ // Symbol is a terminal.
+ const Word &symbol = sentence.GetWord(i);
+ context.push_back(std::make_pair(symbol, WordsRange(i, i)));
+ ++i;
+ } else {
+ // Symbol is a non-terminal.
+ const Word &symbol = (*p)->GetTargetLHS();
+ const WordsRange &range = (*p)->GetCurrSourceRange();
+ context.push_back(std::make_pair(symbol, range));
+ i = range.GetEndPos()+1;
+ ++p;
+ }
+ }
+}
+
+// Given a hypothesis and sentence, reconstructs the 'application context' --
+// the source RHS symbols of the SCFG rule that was applied, plus their spans.
+void IOWrapper::ReconstructApplicationContext(const search::Applied *applied,
+ const Sentence &sentence,
+ ApplicationContext &context)
+{
+ context.clear();
+ const WordsRange &span = applied->GetRange();
+ const search::Applied *child = applied->Children();
+ size_t i = span.GetStartPos();
+ size_t j = 0;
+
+ while (i <= span.GetEndPos()) {
+ if (j == applied->GetArity() || i < child->GetRange().GetStartPos()) {
+ // Symbol is a terminal.
+ const Word &symbol = sentence.GetWord(i);
+ context.push_back(std::make_pair(symbol, WordsRange(i, i)));
+ ++i;
+ } else {
+ // Symbol is a non-terminal.
+ const Word &symbol = static_cast<const TargetPhrase*>(child->GetNote().vp)->GetTargetLHS();
+ const WordsRange &range = child->GetRange();
+ context.push_back(std::make_pair(symbol, range));
+ i = range.GetEndPos()+1;
+ ++child;
+ ++j;
+ }
+ }
+}
+
+// Emulates the old operator<<(ostream &, const DottedRule &) function. The
+// output format is a bit odd (reverse order and double spacing between symbols)
+// but there are scripts and tools that expect the output of -T to look like
+// that.
+void IOWrapper::WriteApplicationContext(std::ostream &out,
+ const ApplicationContext &context)
+{
+ assert(!context.empty());
+ ApplicationContext::const_reverse_iterator p = context.rbegin();
+ while (true) {
+ out << p->second << "=" << p->first << " ";
+ if (++p == context.rend()) {
+ break;
+ }
+ out << " ";
+ }
+}
+
+/***
+ * print surface factor only for the given phrase
+ */
+void IOWrapper::OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors)
+{
+ UTIL_THROW_IF2(outputFactorOrder.size() == 0,
+ "Cannot be empty phrase");
+ if (reportAllFactors == true) {
+ out << phrase;
+ } else {
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+ out << *factor;
+ UTIL_THROW_IF2(factor == NULL,
+ "Empty factor 0 at position " << pos);
+
+ for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
+ UTIL_THROW_IF2(factor == NULL,
+ "Empty factor " << i << " at position " << pos);
+
+ out << "|" << *factor;
+ }
+ out << " ";
+ }
+ }
+}
+
+
+
+
+//////////////////////////////////////////////////////////////////////////
+/***
+ * print surface factor only for the given phrase
+ */
+void IOWrapper::OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
+ char reportSegmentation, bool reportAllFactors)
+{
+ UTIL_THROW_IF2(outputFactorOrder.size() == 0,
+ "Must specific at least 1 output factor");
+ const TargetPhrase& phrase = edge.GetCurrTargetPhrase();
+ bool markUnknown = StaticData::Instance().GetMarkUnknown();
+ if (reportAllFactors == true) {
+ out << phrase;
+ } else {
+ FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
+
+ std::map<size_t, const Factor*> placeholders;
+ if (placeholderFactor != NOT_FOUND) {
+ // creates map of target position -> factor for placeholders
+ placeholders = GetPlaceholders(edge, placeholderFactor);
+ }
+
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+
+ if (placeholders.size()) {
+ // do placeholders
+ std::map<size_t, const Factor*>::const_iterator iter = placeholders.find(pos);
+ if (iter != placeholders.end()) {
+ factor = iter->second;
+ }
+ }
+
+ UTIL_THROW_IF2(factor == NULL,
+ "No factor 0 at position " << pos);
+
+ //preface surface form with UNK if marking unknowns
+ const Word &word = phrase.GetWord(pos);
+ if(markUnknown && word.IsOOV()) {
+ out << "UNK" << *factor;
+ } else {
+ out << *factor;
+ }
+
+ for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
+ UTIL_THROW_IF2(factor == NULL,
+ "No factor " << i << " at position " << pos);
+
+ out << "|" << *factor;
+ }
+ out << " ";
+ }
+ }
+
+ // trace ("report segmentation") option "-t" / "-tt"
+ if (reportSegmentation > 0 && phrase.GetSize() > 0) {
+ const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
+ const int sourceStart = sourceRange.GetStartPos();
+ const int sourceEnd = sourceRange.GetEndPos();
+ out << "|" << sourceStart << "-" << sourceEnd; // enriched "-tt"
+ if (reportSegmentation == 2) {
+ out << ",wa=";
+ const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
+ OutputAlignment(out, ai, 0, 0);
+ out << ",total=";
+ out << edge.GetScore() - edge.GetPrevHypo()->GetScore();
+ out << ",";
+ ScoreComponentCollection scoreBreakdown(edge.GetScoreBreakdown());
+ scoreBreakdown.MinusEquals(edge.GetPrevHypo()->GetScoreBreakdown());
+ OutputAllFeatureScores(scoreBreakdown, out);
+ }
+ out << "| ";
+ }
+}
+
+void IOWrapper::OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder,
+ char reportSegmentation, bool reportAllFactors)
+{
+ if (hypo != NULL) {
+ // recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence
+ OutputBestSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
+ OutputSurface(out, *hypo, outputFactorOrder, reportSegmentation, reportAllFactors);
+ }
+}
+
+void IOWrapper::OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
+{
+ typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
+ AlignVec alignments = ai.GetSortedAlignments();
+
+ AlignVec::const_iterator it;
+ for (it = alignments.begin(); it != alignments.end(); ++it) {
+ const std::pair<size_t,size_t> &alignment = **it;
+ out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
+ }
+
+}
+
+void IOWrapper::OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
+{
+ size_t targetOffset = 0;
+
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ const TargetPhrase &tp = edge.GetCurrTargetPhrase();
+ size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
+
+ OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
+
+ targetOffset += tp.GetSize();
+ }
+ // Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
+ // Or fix it somewhere else.
+ out << std::endl;
+}
+
+void IOWrapper::OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
+{
+ std::vector<const Hypothesis *> edges;
+ const Hypothesis *currentHypo = hypo;
+ while (currentHypo) {
+ edges.push_back(currentHypo);
+ currentHypo = currentHypo->GetPrevHypo();
+ }
+
+ OutputAlignment(out, edges);
+
+}
+
+void IOWrapper::OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
+{
+ ostringstream out;
+ OutputAlignment(out, edges);
+
+ collector->Write(lineNo,out.str());
+}
+
+void IOWrapper::OutputAlignment(OutputCollector* collector, size_t lineNo , const Hypothesis *hypo)
+{
+ if (collector) {
+ std::vector<const Hypothesis *> edges;
+ const Hypothesis *currentHypo = hypo;
+ while (currentHypo) {
+ edges.push_back(currentHypo);
+ currentHypo = currentHypo->GetPrevHypo();
+ }
+
+ OutputAlignment(collector,lineNo, edges);
+ }
+}
+
+void IOWrapper::OutputAlignment(OutputCollector* collector, size_t lineNo , const TrellisPath &path)
+{
+ if (collector) {
+ OutputAlignment(collector,lineNo, path.GetEdges());
+ }
+}
+
+void IOWrapper::OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/, char reportSegmentation, bool reportAllFactors, std::ostream &out)
+{
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
+
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ OutputSurface(out, edge, StaticData::Instance().GetOutputFactorOrder(), reportSegmentation, reportAllFactors);
+ }
+ out << endl;
+}
+
+void IOWrapper::Backtrack(const Hypothesis *hypo)
+{
+
+ if (hypo->GetPrevHypo() != NULL) {
+ VERBOSE(3,hypo->GetId() << " <= ");
+ Backtrack(hypo->GetPrevHypo());
+ }
+}
+
+void IOWrapper::OutputBestHypo(const std::vector<Word>& mbrBestHypo, long /*translationId*/, char /*reportSegmentation*/, bool /*reportAllFactors*/, ostream& out)
+{
+
+ for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
+ const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
+ UTIL_THROW_IF2(factor == NULL,
+ "No factor 0 at position " << i);
+ if (i>0) out << " " << *factor;
+ else out << *factor;
+ }
+ out << endl;
+}
+
+
+void IOWrapper::OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
+{
+ if (hypo->GetPrevHypo()) {
+ OutputInput(map, hypo->GetPrevHypo());
+ map[hypo->GetCurrSourceWordsRange().GetStartPos()] = &hypo->GetTranslationOption().GetInputPath().GetPhrase();
+ }
+}
+
+void IOWrapper::OutputInput(std::ostream& os, const Hypothesis* hypo)
+{
+ size_t len = hypo->GetInput().GetSize();
+ std::vector<const Phrase*> inp_phrases(len, 0);
+ OutputInput(inp_phrases, hypo);
+ for (size_t i=0; i<len; ++i)
+ if (inp_phrases[i]) os << *inp_phrases[i];
+}
+
+void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, char reportSegmentation, bool reportAllFactors)
+{
+ if (hypo != NULL) {
+ VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
+ VERBOSE(3,"Best path: ");
+ Backtrack(hypo);
+ VERBOSE(3,"0" << std::endl);
+ if (!m_surpressSingleBestOutput) {
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ cout << hypo->GetTotalScore() << " ";
+ }
+
+ if (StaticData::Instance().IsPathRecoveryEnabled()) {
+ OutputInput(cout, hypo);
+ cout << "||| ";
+ }
+ OutputBestSurface(cout, hypo, *m_outputFactorOrder, reportSegmentation, reportAllFactors);
+ cout << endl;
+ }
+ } else {
+ VERBOSE(1, "NO BEST TRANSLATION" << endl);
+ if (!m_surpressSingleBestOutput) {
+ cout << endl;
+ }
+ }
+}
+
+bool IOWrapper::ReadInput(InputTypeEnum inputType, InputType*& source)
+{
+ delete source;
+ switch(inputType) {
+ case SentenceInput:
+ source = GetInput(new Sentence);
+ break;
+ case ConfusionNetworkInput:
+ source = GetInput(new ConfusionNet);
+ break;
+ case WordLatticeInput:
+ source = GetInput(new WordLattice);
+ break;
+ case TreeInputType:
+ source = GetInput(new TreeInput);
+ break;
+ default:
+ TRACE_ERR("Unknown input type: " << inputType << "\n");
+ }
+ return (source ? true : false);
+}
+
+void IOWrapper::OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
+ , std::ostream &out)
+{
+ std::string lastName = "";
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for( size_t i=0; i<sff.size(); i++ ) {
+ const StatefulFeatureFunction *ff = sff[i];
+ if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
+ && ff->IsTuneable()) {
+ OutputFeatureScores( out, features, ff, lastName );
+ }
+ }
+ const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ for( size_t i=0; i<slf.size(); i++ ) {
+ const StatelessFeatureFunction *ff = slf[i];
+ if (ff->IsTuneable()) {
+ OutputFeatureScores( out, features, ff, lastName );
+ }
+ }
+}
+
+void IOWrapper::OutputFeatureScores( std::ostream& out
+ , const ScoreComponentCollection &features
+ , const FeatureFunction *ff
+ , std::string &lastName )
+{
+ const StaticData &staticData = StaticData::Instance();
+ bool labeledOutput = staticData.IsLabeledNBestList();
+
+ // regular features (not sparse)
+ if (ff->GetNumScoreComponents() != 0) {
+ if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
+ lastName = ff->GetScoreProducerDescription();
+ out << " " << lastName << "=";
+ }
+ vector<float> scores = features.GetScoresForProducer( ff );
+ for (size_t j = 0; j<scores.size(); ++j) {
+ out << " " << scores[j];
+ }
+ }
+
+ // sparse features
+ const FVector scores = features.GetVectorForProducer( ff );
+ for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
+ out << " " << i->first << "= " << i->second;
+ }
+}
+
+void IOWrapper::OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId)
+{
+ for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
+ out << translationId;
+ out << " |||";
+ const vector<Word> mbrHypo = si->GetWords();
+ for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
+ const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
+ if (i>0) out << " " << *factor;
+ else out << *factor;
+ }
+ out << " |||";
+ out << " map: " << si->GetMapScore();
+ out << " w: " << mbrHypo.size();
+ const vector<float>& ngramScores = si->GetNgramScores();
+ for (size_t i = 0; i < ngramScores.size(); ++i) {
+ out << " " << ngramScores[i];
+ }
+ out << " ||| " << si->GetScore();
+
+ out << endl;
+ }
+}
+
+
+void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solutions,long translationId)
+{
+ OutputLatticeMBRNBest(*m_nBestStream, solutions,translationId);
+}
+
+////////////////////////////
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/S2T/DerivationWriter.h"
+
+void IOWrapper::OutputBestHypo(const Syntax::SHyperedge *best,
+ long translationId)
+{
+ if (!m_singleBestOutputCollector) {
+ return;
+ }
+ std::ostringstream out;
+ FixPrecision(out);
+ if (best == NULL) {
+ VERBOSE(1, "NO BEST TRANSLATION" << std::endl);
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << "0 ";
+ }
+ } else {
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << best->score << " ";
+ }
+ Phrase yield = Syntax::GetOneBestTargetYield(*best);
+ // delete 1st & last
+ UTIL_THROW_IF2(yield.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+ yield.RemoveWord(0);
+ yield.RemoveWord(yield.GetSize()-1);
+ out << yield.GetStringRep(StaticData::Instance().GetOutputFactorOrder());
+ out << '\n';
+ }
+ m_singleBestOutputCollector->Write(translationId, out.str());
+}
+
+} // namespace
+
diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h
new file mode 100644
index 000000000..19506a644
--- /dev/null
+++ b/moses/IOWrapper.h
@@ -0,0 +1,218 @@
+// $Id$
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the University of Edinburgh nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+// example file on how to use moses library
+
+#pragma once
+
+#include <cassert>
+#include <fstream>
+#include <ostream>
+#include <vector>
+
+#include "moses/TypeDef.h"
+#include "moses/Sentence.h"
+#include "moses/FactorTypeSet.h"
+#include "moses/FactorCollection.h"
+#include "moses/Hypothesis.h"
+#include "moses/OutputCollector.h"
+#include "moses/TrellisPathList.h"
+#include "moses/InputFileStream.h"
+#include "moses/InputType.h"
+#include "moses/WordLattice.h"
+#include "moses/LatticeMBR.h"
+#include "moses/ChartKBestExtractor.h"
+#include "moses/Syntax/KBestExtractor.h"
+
+#include "search/applied.hh"
+
+namespace Moses
+{
+class ScoreComponentCollection;
+class Hypothesis;
+class ChartHypothesis;
+class Factor;
+
+namespace Syntax
+{
+struct SHyperedge;
+}
+
+/** Helper class that holds misc variables to write data out to command line.
+ */
+class IOWrapper
+{
+protected:
+
+ const std::vector<Moses::FactorType> *m_inputFactorOrder;
+ const std::vector<Moses::FactorType> *m_outputFactorOrder;
+ Moses::FactorMask m_inputFactorUsed;
+ std::string m_inputFilePath;
+ Moses::InputFileStream *m_inputFile;
+ std::istream *m_inputStream;
+ std::ostream *m_nBestStream;
+ std::ostream *m_outputWordGraphStream;
+ std::ostream *m_outputSearchGraphStream;
+ std::ostream *m_detailedTranslationReportingStream;
+ std::ostream *m_unknownsStream;
+ std::ostream *m_detailedTreeFragmentsTranslationReportingStream;
+ std::ofstream *m_alignmentInfoStream;
+ std::ofstream *m_latticeSamplesStream;
+
+ Moses::OutputCollector *m_singleBestOutputCollector;
+ Moses::OutputCollector *m_nBestOutputCollector;
+ Moses::OutputCollector *m_unknownsCollector;
+ Moses::OutputCollector *m_alignmentInfoCollector;
+ Moses::OutputCollector *m_searchGraphOutputCollector;
+ Moses::OutputCollector *m_detailedTranslationCollector;
+ Moses::OutputCollector *m_wordGraphCollector;
+ Moses::OutputCollector *m_latticeSamplesCollector;
+ Moses::OutputCollector *m_detailTreeFragmentsOutputCollector;
+
+ bool m_surpressSingleBestOutput;
+
+ // CHART
+ typedef std::vector<std::pair<Moses::Word, Moses::WordsRange> > ApplicationContext;
+
+ void Backtrack(const ChartHypothesis *hypo);
+ void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
+ void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
+ void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
+ void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
+
+ void ReconstructApplicationContext(const Moses::ChartHypothesis &hypo,
+ const Moses::Sentence &sentence,
+ ApplicationContext &context);
+ void ReconstructApplicationContext(const search::Applied *applied,
+ const Moses::Sentence &sentence,
+ ApplicationContext &context);
+ void WriteApplicationContext(std::ostream &out,
+ const ApplicationContext &context);
+
+ void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors);
+ void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
+ char reportSegmentation, bool reportAllFactors);
+
+public:
+ IOWrapper();
+ ~IOWrapper();
+
+ Moses::InputType* GetInput(Moses::InputType *inputType);
+ bool ReadInput(Moses::InputTypeEnum inputType, Moses::InputType*& source);
+
+ void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, char reportSegmentation, bool reportAllFactors);
+ void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
+ void Backtrack(const Moses::Hypothesis *hypo);
+
+ Moses::OutputCollector *GetSingleBestOutputCollector() {
+ return m_singleBestOutputCollector;
+ }
+
+ Moses::OutputCollector *GetNBestOutputCollector() {
+ return m_nBestOutputCollector;
+ }
+
+ Moses::OutputCollector *GetUnknownsCollector() {
+ return m_unknownsCollector;
+ }
+
+ Moses::OutputCollector *GetAlignmentInfoCollector() {
+ return m_alignmentInfoCollector;
+ }
+
+ Moses::OutputCollector *GetSearchGraphOutputCollector() {
+ return m_searchGraphOutputCollector;
+ }
+
+ Moses::OutputCollector *GetDetailedTranslationCollector() {
+ return m_detailedTranslationCollector;
+ }
+
+ Moses::OutputCollector *GetWordGraphCollector() {
+ return m_wordGraphCollector;
+ }
+
+ Moses::OutputCollector *GetLatticeSamplesCollector() {
+ return m_latticeSamplesCollector;
+ }
+
+ Moses::OutputCollector *GetDetailTreeFragmentsOutputCollector() {
+ return m_detailTreeFragmentsOutputCollector;
+ }
+
+
+ // CHART
+ void OutputBestHypo(const Moses::ChartHypothesis *hypo, long translationId);
+ void OutputBestHypo(search::Applied applied, long translationId);
+ void OutputBestHypo(const Moses::Syntax::SHyperedge *, long translationId);
+
+ void OutputBestNone(long translationId);
+
+ // phrase-based
+ void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, char reportSegmentation, bool reportAllFactors);
+ void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
+ void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
+ char reportSegmentation, bool reportAllFactors, std::ostream& out);
+ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,char reportSegmentation, bool reportAllFactors, std::ostream &out);
+ void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
+ void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo);
+
+ void OutputPassthroughInformation(std::string& passthrough, const Moses::Hypothesis* hypo);
+ void OutputPassthroughInformation(std::ostream& os, const Moses::Hypothesis* hypo);
+
+ void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
+ void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
+ void OutputAlignment(OutputCollector* collector, size_t lineNo , const std::vector<const Hypothesis *> &edges);
+
+ static void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
+ static void OutputAlignment(std::ostream &out, const std::vector<const Hypothesis *> &edges);
+ static void OutputAlignment(std::ostream &out, const Moses::AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset);
+
+ static void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
+ , std::ostream &out);
+ static void OutputFeatureScores( std::ostream& out
+ , const Moses::ScoreComponentCollection &features
+ , const Moses::FeatureFunction *ff
+ , std::string &lastName );
+
+ // creates a map of TARGET positions which should be replaced by word using placeholder
+ std::map<size_t, const Moses::Factor*> GetPlaceholders(const Moses::Hypothesis &hypo, Moses::FactorType placeholderFactor);
+
+ // post editing
+ std::ifstream *spe_src, *spe_trg, *spe_aln;
+
+};
+
+
+
+}
+
diff --git a/moses/Incremental.cpp b/moses/Incremental.cpp
index a01b49ae7..d5de7b4d7 100644
--- a/moses/Incremental.cpp
+++ b/moses/Incremental.cpp
@@ -8,6 +8,7 @@
#include "moses/StaticData.h"
#include "moses/Util.h"
#include "moses/LM/Base.h"
+#include "moses/OutputCollector.h"
#include "lm/model.hh"
#include "search/applied.hh"
@@ -102,7 +103,7 @@ public:
return vertex.BestChild();
}
- void Evaluate(const InputType &input, const InputPath &inputPath) {
+ void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath) {
// TODO for input lattice
}
private:
@@ -203,7 +204,7 @@ struct ChartCellBaseFactory {
} // namespace
Manager::Manager(const InputType &source) :
- source_(source),
+ BaseManager(source),
cells_(source, ChartCellBaseFactory()),
parser_(source, cells_),
n_best_(search::NBestConfig(StaticData::Instance().GetNBestSize())) {}
@@ -220,7 +221,7 @@ template <class Model, class Best> search::History Manager::PopulateBest(const M
search::Config config(abstract.GetWeight() * M_LN10, data.GetCubePruningPopLimit(), search::NBestConfig(data.GetNBestSize()));
search::Context<Model> context(config, model);
- size_t size = source_.GetSize();
+ size_t size = m_source.GetSize();
boost::object_pool<search::Vertex> vertex_pool(std::max<size_t>(size * size / 2, 32));
for (int startPos = size-1; startPos >= 0; --startPos) {
@@ -272,12 +273,198 @@ template void Manager::LMCallback<lm::ngram::QuantTrieModel>(const lm::ngram::Qu
template void Manager::LMCallback<lm::ngram::ArrayTrieModel>(const lm::ngram::ArrayTrieModel &model, const std::vector<lm::WordIndex> &words);
template void Manager::LMCallback<lm::ngram::QuantArrayTrieModel>(const lm::ngram::QuantArrayTrieModel &model, const std::vector<lm::WordIndex> &words);
-const std::vector<search::Applied> &Manager::ProcessSentence()
+void Manager::Decode()
{
LanguageModel::GetFirstLM().IncrementalCallback(*this);
+}
+
+const std::vector<search::Applied> &Manager::GetNBest() const
+{
return *completed_nbest_;
}
+void Manager::OutputNBest(OutputCollector *collector) const
+{
+ if (collector == NULL) {
+ return;
+ }
+
+ OutputNBestList(collector, *completed_nbest_, m_source.GetTranslationId());
+}
+
+void Manager::OutputNBestList(OutputCollector *collector, const std::vector<search::Applied> &nbest, long translationId) const
+{
+ const StaticData &staticData = StaticData::Instance();
+ const std::vector<Moses::FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder();
+
+ std::ostringstream out;
+ // wtf? copied from the original OutputNBestList
+ if (collector->OutputIsCout()) {
+ FixPrecision(out);
+ }
+ Phrase outputPhrase;
+ ScoreComponentCollection features;
+ for (std::vector<search::Applied>::const_iterator i = nbest.begin(); i != nbest.end(); ++i) {
+ Incremental::PhraseAndFeatures(*i, outputPhrase, features);
+ // <s> and </s>
+ UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+
+ outputPhrase.RemoveWord(0);
+ outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
+ out << translationId << " ||| ";
+ OutputSurface(out, outputPhrase, outputFactorOrder, false);
+ out << " ||| ";
+ OutputAllFeatureScores(features, out);
+ out << " ||| " << i->GetScore() << '\n';
+ }
+ out << std::flush;
+ assert(collector);
+ collector->Write(translationId, out.str());
+}
+
+void Manager::OutputDetailedTranslationReport(OutputCollector *collector) const
+{
+ if (collector && !completed_nbest_->empty()) {
+ const search::Applied &applied = completed_nbest_->at(0);
+ OutputDetailedTranslationReport(collector,
+ &applied,
+ static_cast<const Sentence&>(m_source),
+ m_source.GetTranslationId());
+ }
+
+}
+
+void Manager::OutputDetailedTranslationReport(
+ OutputCollector *collector,
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId) const
+{
+ if (applied == NULL) {
+ return;
+ }
+ std::ostringstream out;
+ ApplicationContext applicationContext;
+
+ OutputTranslationOptions(out, applicationContext, applied, sentence, translationId);
+ collector->Write(translationId, out.str());
+}
+
+void Manager::OutputTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const search::Applied *applied,
+ const Sentence &sentence, long translationId) const
+{
+ if (applied != NULL) {
+ OutputTranslationOption(out, applicationContext, applied, sentence, translationId);
+ out << std::endl;
+ }
+
+ // recursive
+ const search::Applied *child = applied->Children();
+ for (size_t i = 0; i < applied->GetArity(); i++) {
+ OutputTranslationOptions(out, applicationContext, child++, sentence, translationId);
+ }
+}
+
+void Manager::OutputTranslationOption(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId) const
+{
+ ReconstructApplicationContext(applied, sentence, applicationContext);
+ const TargetPhrase &phrase = *static_cast<const TargetPhrase*>(applied->GetNote().vp);
+ out << "Trans Opt " << translationId
+ << " " << applied->GetRange()
+ << ": ";
+ WriteApplicationContext(out, applicationContext);
+ out << ": " << phrase.GetTargetLHS()
+ << "->" << phrase
+ << " " << applied->GetScore(); // << hypo->GetScoreBreakdown() TODO: missing in incremental search hypothesis
+}
+
+// Given a hypothesis and sentence, reconstructs the 'application context' --
+// the source RHS symbols of the SCFG rule that was applied, plus their spans.
+void Manager::ReconstructApplicationContext(const search::Applied *applied,
+ const Sentence &sentence,
+ ApplicationContext &context) const
+{
+ context.clear();
+ const WordsRange &span = applied->GetRange();
+ const search::Applied *child = applied->Children();
+ size_t i = span.GetStartPos();
+ size_t j = 0;
+
+ while (i <= span.GetEndPos()) {
+ if (j == applied->GetArity() || i < child->GetRange().GetStartPos()) {
+ // Symbol is a terminal.
+ const Word &symbol = sentence.GetWord(i);
+ context.push_back(std::make_pair(symbol, WordsRange(i, i)));
+ ++i;
+ } else {
+ // Symbol is a non-terminal.
+ const Word &symbol = static_cast<const TargetPhrase*>(child->GetNote().vp)->GetTargetLHS();
+ const WordsRange &range = child->GetRange();
+ context.push_back(std::make_pair(symbol, range));
+ i = range.GetEndPos()+1;
+ ++child;
+ ++j;
+ }
+ }
+}
+
+void Manager::OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const
+{
+ if (collector == NULL || Completed().empty()) {
+ return;
+ }
+
+ const search::Applied *applied = &Completed()[0];
+ const Sentence &sentence = dynamic_cast<const Sentence &>(m_source);
+ const size_t translationId = m_source.GetTranslationId();
+
+ std::ostringstream out;
+ ApplicationContext applicationContext;
+
+ OutputTreeFragmentsTranslationOptions(out, applicationContext, applied, sentence, translationId);
+
+ //Tree of full sentence
+ //TODO: incremental search doesn't support stateful features
+
+ collector->Write(translationId, out.str());
+
+}
+
+void Manager::OutputTreeFragmentsTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId) const
+{
+
+ if (applied != NULL) {
+ OutputTranslationOption(out, applicationContext, applied, sentence, translationId);
+
+ const TargetPhrase &currTarPhr = *static_cast<const TargetPhrase*>(applied->GetNote().vp);
+
+ out << " ||| ";
+ if (const PhraseProperty *property = currTarPhr.GetProperty("Tree")) {
+ out << " " << *property->GetValueString();
+ } else {
+ out << " " << "noTreeInfo";
+ }
+ out << std::endl;
+ }
+
+ // recursive
+ const search::Applied *child = applied->Children();
+ for (size_t i = 0; i < applied->GetArity(); i++) {
+ OutputTreeFragmentsTranslationOptions(out, applicationContext, child++, sentence, translationId);
+ }
+}
+
namespace
{
@@ -327,7 +514,7 @@ void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreCompone
const LanguageModel &model = LanguageModel::GetFirstLM();
model.CalcScore(phrase, full, ignored_ngram, ignored_oov);
- // CalcScore transforms, but EvaluateChart doesn't.
+ // CalcScore transforms, but EvaluateWhenApplied doesn't.
features.Assign(&model, full);
}
diff --git a/moses/Incremental.h b/moses/Incremental.h
index 20040bf45..1115884ee 100644
--- a/moses/Incremental.h
+++ b/moses/Incremental.h
@@ -7,6 +7,8 @@
#include "moses/ChartCellCollection.h"
#include "moses/ChartParser.h"
+#include "BaseManager.h"
+
#include <vector>
#include <string>
@@ -19,7 +21,7 @@ class LanguageModel;
namespace Incremental
{
-class Manager
+class Manager : public BaseManager
{
public:
Manager(const InputType &source);
@@ -28,17 +30,37 @@ public:
template <class Model> void LMCallback(const Model &model, const std::vector<lm::WordIndex> &words);
- const std::vector<search::Applied> &ProcessSentence();
+ void Decode();
+
+ const std::vector<search::Applied> &GetNBest() const;
// Call to get the same value as ProcessSentence returned.
const std::vector<search::Applied> &Completed() const {
return *completed_nbest_;
}
+ // output
+ void OutputNBest(OutputCollector *collector) const;
+ void OutputDetailedTranslationReport(OutputCollector *collector) const;
+ void OutputNBestList(OutputCollector *collector, const std::vector<search::Applied> &nbest, long translationId) const;
+ void OutputLatticeSamples(OutputCollector *collector) const
+ {}
+ void OutputAlignment(OutputCollector *collector) const
+ {}
+ void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const;
+ void OutputWordGraph(OutputCollector *collector) const
+ {}
+ void OutputSearchGraph(OutputCollector *collector) const
+ {}
+ void OutputSearchGraphSLF() const
+ {}
+ void OutputSearchGraphHypergraph() const
+ {}
+
+
private:
template <class Model, class Best> search::History PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out);
- const InputType &source_;
ChartCellCollectionBase cells_;
ChartParser parser_;
@@ -51,6 +73,32 @@ private:
search::NBest n_best_;
const std::vector<search::Applied> *completed_nbest_;
+
+ // outputs
+ void OutputDetailedTranslationReport(
+ OutputCollector *collector,
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId) const;
+ void OutputTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId) const;
+ void OutputTranslationOption(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId) const;
+ void ReconstructApplicationContext(const search::Applied *applied,
+ const Sentence &sentence,
+ ApplicationContext &context) const;
+ void OutputTreeFragmentsTranslationOptions(std::ostream &out,
+ ApplicationContext &applicationContext,
+ const search::Applied *applied,
+ const Sentence &sentence,
+ long translationId) const;
+
};
// Just get the phrase.
diff --git a/moses/InputPath.cpp b/moses/InputPath.cpp
index 4d840da4d..3800bcb1b 100644
--- a/moses/InputPath.cpp
+++ b/moses/InputPath.cpp
@@ -85,8 +85,8 @@ size_t InputPath::GetTotalRuleSize() const
size_t ret = 0;
std::map<const PhraseDictionary*, std::pair<const TargetPhraseCollection*, const void*> >::const_iterator iter;
for (iter = m_targetPhrases.begin(); iter != m_targetPhrases.end(); ++iter) {
- const PhraseDictionary *pt = iter->first;
- const TargetPhraseCollection *tpColl = iter->second.first;
+ // const PhraseDictionary *pt = iter->first;
+ const TargetPhraseCollection *tpColl = iter->second.first;
if (tpColl) {
ret += tpColl->GetSize();
@@ -100,7 +100,6 @@ std::ostream& operator<<(std::ostream& out, const InputPath& obj)
{
out << &obj << " " << obj.GetWordsRange() << " " << obj.GetPrevPath() << " " << obj.GetPhrase();
- out << "pt: ";
std::map<const PhraseDictionary*, std::pair<const TargetPhraseCollection*, const void*> >::const_iterator iter;
for (iter = obj.m_targetPhrases.begin(); iter != obj.m_targetPhrases.end(); ++iter) {
const PhraseDictionary *pt = iter->first;
diff --git a/moses/Jamfile b/moses/Jamfile
index cc65f56ea..74d962d41 100644
--- a/moses/Jamfile
+++ b/moses/Jamfile
@@ -10,7 +10,14 @@ if $(with-dlib) {
dlib = ;
}
-alias headers : ../util//kenutil : : : $(max-factors) $(dlib) ;
+with-oxlm = [ option.get "with-oxlm" ] ;
+if $(with-oxlm) {
+ oxlm = <cxxflags>-std=c++0x <define>LM_OXLM <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+} else {
+ oxlm = ;
+}
+
+alias headers : ../util//kenutil : : : $(max-factors) $(dlib) $(oxlm) ;
alias ThreadPool : ThreadPool.cpp ;
alias Util : Util.cpp Timer.cpp ;
@@ -56,25 +63,33 @@ if [ option.get "with-mm" : no : yes ] = yes
lib moses :
[ glob
*.cpp
+ Syntax/*.cpp
+ Syntax/S2T/*.cpp
+ Syntax/S2T/Parsers/*.cpp
+ Syntax/S2T/Parsers/RecursiveCYKPlusParser/*.cpp
+ Syntax/S2T/Parsers/Scope3Parser/*.cpp
TranslationModel/*.cpp
TranslationModel/fuzzy-match/*.cpp
TranslationModel/DynSAInclude/*.cpp
TranslationModel/RuleTable/*.cpp
TranslationModel/Scope3Parser/*.cpp
TranslationModel/CYKPlusParser/*.cpp
+ ../phrase-extract/extract-ghkm/PhraseOrientation.cpp
FF/*.cpp
+ FF/bilingual-lm/*.cpp
FF/OSM-Feature/*.cpp
FF/LexicalReordering/*.cpp
PP/*.cpp
: #exceptions
ThreadPool.cpp
SyntacticLanguageModel.cpp
- *Test.cpp Mock*.cpp
+ *Test.cpp Mock*.cpp FF/*Test.cpp
FF/Factory.cpp
]
-headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool
+headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT TranslationModel/ProbingPT//ProbingPT synlm ThreadPool
+
..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt
-$(TOP)//boost_iostreams mmlib
+$(TOP)//boost_filesystem $(TOP)//boost_iostreams mmlib
:
<threading>single:<source>../util//rt
;
@@ -84,5 +99,5 @@ alias headers-to-install : [ glob-tree *.h ] ;
import testing ;
-unit-test moses_test : [ glob *Test.cpp Mock*.cpp ] moses headers ..//z ../OnDiskPt//OnDiskPt ..//boost_unit_test_framework ;
+unit-test moses_test : [ glob *Test.cpp Mock*.cpp FF/*Test.cpp ] ..//boost_filesystem moses headers ..//z ../OnDiskPt//OnDiskPt ..//boost_unit_test_framework ;
diff --git a/moses/LM/Base.cpp b/moses/LM/Base.cpp
index f59b5e31b..db71119d5 100644
--- a/moses/LM/Base.cpp
+++ b/moses/LM/Base.cpp
@@ -69,7 +69,7 @@ void LanguageModel::ReportHistoryOrder(std::ostream &out,const Phrase &phrase) c
// out << "ReportHistoryOrder not implemented";
}
-void LanguageModel::Evaluate(const Phrase &source
+void LanguageModel::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/LM/Base.h b/moses/LM/Base.h
index abae5de24..2be19e5bd 100644
--- a/moses/LM/Base.h
+++ b/moses/LM/Base.h
@@ -87,11 +87,11 @@ public:
virtual void IncrementalCallback(Incremental::Manager &manager) const;
virtual void ReportHistoryOrder(std::ostream &out,const Phrase &phrase) const;
- virtual void Evaluate(const Phrase &source
+ virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
- void Evaluate(const InputType &input
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
diff --git a/moses/LM/BilingualLM.cpp b/moses/LM/BilingualLM.cpp
new file mode 100644
index 000000000..bc0e61a67
--- /dev/null
+++ b/moses/LM/BilingualLM.cpp
@@ -0,0 +1,470 @@
+#include <vector>
+#include "BilingualLM.h"
+#include "moses/ScoreComponentCollection.h"
+
+using namespace std;
+
+namespace Moses {
+
+int BilingualLMState::Compare(const FFState& other) const
+{
+ const BilingualLMState &otherState = static_cast<const BilingualLMState&>(other);
+
+ if (m_hash == otherState.m_hash)
+ return 0;
+ return (m_hash < otherState.m_hash) ? -1 : +1;
+}
+
+////////////////////////////////////////////////////////////////
+BilingualLM::BilingualLM(const std::string &line)
+ : StatefulFeatureFunction(1, line),
+ word_factortype(0) {
+ FactorCollection& factorFactory = FactorCollection::Instance(); //Factor Factory to use for BOS_ and EOS_
+ BOS_factor = factorFactory.AddFactor(BOS_);
+ BOS_word.SetFactor(0, BOS_factor);
+ EOS_factor = factorFactory.AddFactor(EOS_);
+ EOS_word.SetFactor(0, EOS_factor);
+
+}
+
+void BilingualLM::Load(){
+ ReadParameters();
+ loadModel();
+}
+
+//Populates words with `amount` words taken from the target phrases of previous hypotheses, where
+//words[0] is the last word of the previous hypothesis, words[1] the second-to-last, etc...
+void BilingualLM::requestPrevTargetNgrams(
+ const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const {
+ const Hypothesis * prev_hyp = cur_hypo.GetPrevHypo();
+ int found = 0;
+
+ while (prev_hyp && found != amount) {
+ const TargetPhrase& currTargetPhrase = prev_hyp->GetCurrTargetPhrase();
+ for (int i = currTargetPhrase.GetSize() - 1; i> -1; i--){
+ if (found != amount){
+ const Word& word = currTargetPhrase.GetWord(i);
+ words[found] = getNeuralLMId(word, false);
+ found++;
+ } else {
+ return; //We have gotten everything needed
+ }
+ }
+
+ prev_hyp = prev_hyp->GetPrevHypo();
+ }
+
+ int neuralLM_wordID = getNeuralLMId(BOS_word, false);
+ for (int i = found; i < amount; i++){
+ words[i] = neuralLM_wordID;
+ }
+}
+
+//Populates the words vector with target_ngrams words plus the current word we are looking at
+//(in effect target_ngrams + 1 words)
+void BilingualLM::getTargetWords(
+ const Hypothesis &cur_hypo,
+ const TargetPhrase &targetPhrase,
+ int current_word_index,
+ std::vector<int> &words) const {
+ //Check if we need to look at previous target phrases
+ int additional_needed = current_word_index - target_ngrams;
+ if (additional_needed < 0) {
+ additional_needed = -additional_needed;
+ std::vector<int> prev_words(additional_needed);
+ requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
+ for (int i = additional_needed - 1; i >= 0; i--) {
+ words.push_back(prev_words[i]);
+ }
+ }
+
+ if (words.size() > 0) {
+ //We have added some words from previous phrases
+ //Just add until we reach current_word_index
+ for (int i = 0; i <= current_word_index; i++) {
+ const Word& word = targetPhrase.GetWord(i);
+ words.push_back(getNeuralLMId(word, false));
+ }
+ } else {
+ //We haven't added any words, proceed as before
+ for (int i = current_word_index - target_ngrams; i <= current_word_index; i++){
+ const Word& word = targetPhrase.GetWord(i);
+ words.push_back(getNeuralLMId(word, false));
+ }
+ }
+}
+
+//Returns source words in the way NeuralLM expects them.
+
+size_t BilingualLM::selectMiddleAlignment(
+ const set<size_t>& alignment_links) const {
+
+ set<size_t>::iterator it = alignment_links.begin();
+ for (int i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
+ ++it;
+ }
+
+ return *it;
+}
+
+void BilingualLM::getSourceWords(
+ const TargetPhrase &targetPhrase,
+ int targetWordIdx,
+ const Sentence &source_sent,
+ const WordsRange &sourceWordRange,
+ std::vector<int> &words) const {
+ //Get source context
+
+ //Get alignment for the word we require
+ const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();
+
+ // We are getting word alignment for targetPhrase.GetWord(i + target_ngrams -1) according to the paper.
+ // Find the closest target word with alignment links.
+ std::set<size_t> last_word_al;
+ for (int j = 0; j < targetPhrase.GetSize(); j++) {
+ // Find the nearest aligned word with preference for right.
+ if ((targetWordIdx + j) < targetPhrase.GetSize()){
+ last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx + j);
+ if (!last_word_al.empty()) {
+ break;
+ }
+ }
+
+ // We couldn't find word on the right, try to the left.
+ if ((targetWordIdx - j) >= 0) {
+ last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx - j);
+ if (!last_word_al.empty()) {
+ break;
+ }
+ }
+ }
+
+ //Assume we have gotten some alignment here. If we couldn't get an alignment from the above routine it means
+ //that none of the words in the target phrase aligned to any word in the source phrase
+
+ // Now we get the source words. First select middle alignment.
+  //It should never be the case that the word_al size would be zero, but several times this has happened because
+  //of a corrupt phrase table. It is best to have this check here, as it makes debugging the problem a lot easier.
+ UTIL_THROW_IF2(last_word_al.size() == 0,
+ "A target phrase with no alignments detected! " << targetPhrase << "Check if there is something wrong with your phrase table.");
+ size_t source_center_index = selectMiddleAlignment(last_word_al);
+ // We have found the alignment. Now determine how much to shift by to get the actual source word index.
+ size_t phrase_start_pos = sourceWordRange.GetStartPos();
+ // Account for how far the current word is from the start of the phrase.
+ size_t source_word_mid_idx = phrase_start_pos + source_center_index;
+
+ appendSourceWordsToVector(source_sent, words, source_word_mid_idx);
+}
+
+size_t BilingualLM::getState(const Hypothesis& cur_hypo) const {
+ const TargetPhrase &targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ size_t hashCode = 0;
+
+ // Check if we need to look at previous target phrases
+ int additional_needed = targetPhrase.GetSize() - target_ngrams;
+ if (additional_needed < 0) {
+ additional_needed = -additional_needed;
+ std::vector<int> prev_words(additional_needed);
+ requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
+ for (int i = additional_needed - 1; i >= 0; i--) {
+ boost::hash_combine(hashCode, prev_words[i]);
+ }
+
+ // Get the rest of the phrases needed
+ for (int i = 0; i < targetPhrase.GetSize(); i++) {
+ const Word& word = targetPhrase.GetWord(i);
+ int neuralLM_wordID = getNeuralLMId(word, false);
+ boost::hash_combine(hashCode, neuralLM_wordID);
+ }
+ } else {
+ // We just need the last target_ngrams from the current target phrase.
+ for (int i = targetPhrase.GetSize() - target_ngrams; i < targetPhrase.GetSize(); i++) {
+ const Word& word = targetPhrase.GetWord(i);
+ int neuralLM_wordID = getNeuralLMId(word, false);
+
+ boost::hash_combine(hashCode, neuralLM_wordID);
+ }
+ }
+
+ return hashCode;
+}
+
+void BilingualLM::EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const {}
+
+void BilingualLM::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+
+}
+
+
+FFState* BilingualLM::EvaluateWhenApplied(
+ const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const {
+ Manager& manager = cur_hypo.GetManager();
+ const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());
+
+ // Init vectors.
+ std::vector<int> source_words;
+ source_words.reserve(source_ngrams);
+ std::vector<int> target_words;
+ target_words.reserve(target_ngrams);
+
+ float value = 0;
+ const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
+ const WordsRange& sourceWordRange = cur_hypo.GetCurrSourceWordsRange(); //Source words range to calculate offsets
+
+ // For each word in the current target phrase get its LM score.
+ for (int i = 0; i < currTargetPhrase.GetSize(); i++){
+ getSourceWords(
+ currTargetPhrase, i, source_sent, sourceWordRange, source_words);
+ getTargetWords(cur_hypo, currTargetPhrase, i, target_words);
+ value += Score(source_words, target_words);
+
+ // Clear the vectors.
+ source_words.clear();
+ target_words.clear();
+ }
+
+ size_t new_state = getState(cur_hypo);
+ accumulator->PlusEquals(this, value);
+
+ return new BilingualLMState(new_state);
+}
+
+void BilingualLM::getAllTargetIdsChart(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& wordIds) const {
+ const TargetPhrase targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ int next_nonterminal_index = 0;
+
+ for (int i = 0; i < targetPhrase.GetSize(); i++){
+ if (targetPhrase.GetWord(i).IsNonTerminal()){ //Nonterminal get from prev state
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(next_nonterminal_index);
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int> prevWordIDs = prev_state->GetWordIdsVector();
+ for (std::vector<int>::const_iterator it = prevWordIDs.begin(); it!= prevWordIDs.end(); it++){
+ wordIds.push_back(*it);
+ }
+ next_nonterminal_index++;
+ } else {
+ wordIds.push_back(getNeuralLMId(targetPhrase.GetWord(i), false));
+ }
+ }
+}
+
+void BilingualLM::getAllAlignments(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& word_alignemnts) const {
+ const TargetPhrase targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ int next_nonterminal_index = 0;
+ int nonterm_length = 0; //Account for the size of nonterminals when calculating the alignment.
+ int source_phrase_start_pos = cur_hypo.GetCurrSourceRange().GetStartPos();
+ int source_word_mid_idx; //The word alignment
+
+ //Get source sent
+ const ChartManager& manager = cur_hypo.GetManager();
+ const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());
+ const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();
+
+ for (int i = 0; i < targetPhrase.GetSize(); i++){
+    //Sometimes we have to traverse more than one target word because of
+    //unaligned words. This is O(n^2) in the worst case, but usually closer to O(n)
+ if (targetPhrase.GetWord(i).IsNonTerminal()){
+ //If we have a non terminal we can get the alignments from the previous state
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(next_nonterminal_index);
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int> prevWordAls = prev_state->GetWordAlignmentVector();
+ nonterm_length += prevWordAls.size();
+ for (std::vector<int>::const_iterator it = prevWordAls.begin(); it!= prevWordAls.end(); it++){
+ word_alignemnts.push_back(*it);
+ }
+ next_nonterminal_index++;
+ } else {
+ std::set<size_t> word_al; //Keep word alignments
+ bool resolvedIndexis = false; //If we are aligning to an existing nonterm we don't need to calculate offsets
+ for (int j = 0; j < targetPhrase.GetSize(); j++){
+ //Try to get alignment from the current word and if it is unaligned,
+ //try from the first word to the right and then to the left
+ if ((i+j) < targetPhrase.GetSize()) {
+ if (targetPhrase.GetWord(i + j).IsNonTerminal()) {
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(next_nonterminal_index);
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int>& word_alignments = prev_state->GetWordAlignmentVector();
+ source_word_mid_idx = word_alignments.front(); // The first word on the right of our word
+ resolvedIndexis = true;
+ break;
+ }
+ word_al = alignments.GetAlignmentsForTarget(i + j);
+ if (!word_al.empty()) {
+ break;
+ }
+ }
+
+ if ((i - j) >= 0) {
+ if (targetPhrase.GetWord(i - j).IsNonTerminal()) {
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(next_nonterminal_index - 1); //We need to look at the nonterm on the left.
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int>& word_alignments = prev_state->GetWordAlignmentVector();
+ source_word_mid_idx = word_alignments.back(); // The first word on the left of our word
+ resolvedIndexis = true;
+ break;
+ }
+
+ word_al = alignments.GetAlignmentsForTarget(i - j);
+ if (!word_al.empty()) {
+ break;
+ }
+ }
+ }
+
+ if (!resolvedIndexis){
+        //It should never be the case that the word_al size would be zero, but several times this has happened because
+        //of a corrupt phrase table. It is best to have this check here, as it makes debugging the problem a lot easier.
+ UTIL_THROW_IF2(word_al.size() == 0,
+ "A target phrase with no alignments detected! " << targetPhrase << "Check if there is something wrong with your phrase table.");
+ size_t source_center_index = selectMiddleAlignment(word_al);
+ // We have found the alignment. Now determine how much to shift by to get the actual source word index.
+ source_word_mid_idx = source_phrase_start_pos + (int)source_center_index + nonterm_length;
+ }
+ word_alignemnts.push_back(source_word_mid_idx);
+ }
+ }
+
+}
+
+size_t BilingualLM::getStateChart(std::vector<int>& neuralLMids) const {
+ size_t hashCode = 0;
+ for (int i = neuralLMids.size() - target_ngrams; i < neuralLMids.size(); i++){
+ int neuralLM_wordID;
+ if (i < 0) {
+ neuralLM_wordID = getNeuralLMId(BOS_word, false);
+ } else {
+ neuralLM_wordID = neuralLMids[i];
+ }
+ boost::hash_combine(hashCode, neuralLM_wordID);
+ }
+ return hashCode;
+}
+
+void BilingualLM::getTargetWordsChart(
+ std::vector<int>& neuralLMids,
+ int current_word_index,
+ std::vector<int>& words,
+ bool sentence_begin) const {
+
+ for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
+ if (i < 0) {
+ if (sentence_begin) {
+ words.push_back(getNeuralLMId(BOS_word, false));
+ } else {
+ words.push_back(getNeuralLMId(getNullWord(), false));
+ }
+ } else {
+ words.push_back(neuralLMids[i]);
+ }
+ }
+}
+
+void BilingualLM::appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const {
+  //Define begin and end indexes of the lookup. Cases for even and odd ngrams.
+  //This can result in indexes which span beyond the length of the source sentence;
+  //in that case we pad with BOS (before the start) and EOS (past the end) word IDs.
+ int begin_idx;
+ int end_idx;
+
+ if (source_ngrams % 2 == 0) {
+ begin_idx = source_word_mid_idx - source_ngrams / 2 + 1;
+ end_idx = source_word_mid_idx + source_ngrams / 2;
+ } else {
+ begin_idx = source_word_mid_idx - (source_ngrams - 1) / 2;
+ end_idx = source_word_mid_idx + (source_ngrams - 1) / 2;
+ }
+
+ //Add words to vector
+ for (int j = begin_idx; j <= end_idx; j++) {
+ int neuralLM_wordID;
+ if (j < 0) {
+ neuralLM_wordID = getNeuralLMId(BOS_word, true);
+ } else if (j >= source_sent.GetSize()) {
+ neuralLM_wordID = getNeuralLMId(EOS_word, true);
+ } else {
+ const Word& word = source_sent.GetWord(j);
+ neuralLM_wordID = getNeuralLMId(word, true);
+ }
+ words.push_back(neuralLM_wordID);
+ }
+}
+
+FFState* BilingualLM::EvaluateWhenApplied(
+ const ChartHypothesis& cur_hypo,
+ int featureID, /* - used to index the state in the previous hypotheses */
+ ScoreComponentCollection* accumulator) const {
+ //Init vectors
+ std::vector<int> source_words;
+ source_words.reserve(source_ngrams);
+ std::vector<int> target_words;
+ target_words.reserve(target_ngrams);
+
+ float value = 0; //NeuralLM score
+ const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
+
+ std::vector<int> neuralLMids; //Equivalent more or less to whole_phrase. Contains all word ids but not as expensive
+ std::vector<int> alignments;
+ //Estimate size and reserve vectors to avoid reallocation
+ int future_size = currTargetPhrase.GetNumTerminals();
+ for (int i =0; i<currTargetPhrase.GetNumNonTerminals(); i++){
+ const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(i); //We need to look at the nonterm on the left.
+ const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
+ const std::vector<int>& wordIds = prev_state->GetWordIdsVector();
+ future_size += wordIds.size();
+ }
+ neuralLMids.reserve(future_size);
+ neuralLMids.reserve(future_size);
+
+ getAllTargetIdsChart(cur_hypo, featureID, neuralLMids);
+ getAllAlignments(cur_hypo, featureID, alignments);
+
+ bool sentence_begin = false; //Check if this hypothesis' target words are located in the beginning of the sentence
+ if (neuralLMids[0] == getNeuralLMId(BOS_word, true)){
+ sentence_begin = true;
+ }
+
+ //Get source sentence
+ const ChartManager& manager = cur_hypo.GetManager();
+ const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());
+
+ for (int i = 0; i < neuralLMids.size(); i++) { //This loop should be bigger as non terminals expand
+
+ //We already have resolved the nonterminals, we are left with a simple loop.
+ appendSourceWordsToVector(source_sent, source_words, alignments[i]);
+ getTargetWordsChart(neuralLMids, i, target_words, sentence_begin);
+
+ value += Score(source_words, target_words); // Get the score
+
+ //Clear the vectors before the next iteration
+ source_words.clear();
+ target_words.clear();
+
+ }
+ size_t new_state = getStateChart(neuralLMids);
+
+ accumulator->Assign(this, value);
+
+ return new BilingualLMState(new_state, alignments, neuralLMids);
+}
+
+void BilingualLM::SetParameter(const std::string& key, const std::string& value) {
+ if (key == "filepath") {
+ m_filePath = value;
+ } else {
+ StatefulFeatureFunction::SetParameter(key, value);
+ }
+}
+
+} // namespace Moses
+
diff --git a/moses/LM/BilingualLM.h b/moses/LM/BilingualLM.h
new file mode 100644
index 000000000..9f7235956
--- /dev/null
+++ b/moses/LM/BilingualLM.h
@@ -0,0 +1,142 @@
+#pragma once
+
+#include <string>
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/FF/FFState.h"
+#include <boost/thread/tss.hpp>
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/InputPath.h"
+#include "moses/Manager.h"
+#include "moses/ChartManager.h"
+#include "moses/FactorCollection.h"
+
+namespace Moses
+{
+
+class BilingualLMState : public FFState
+{
+ size_t m_hash;
+ std::vector<int> word_alignments; //Carry the word alignments. For hierarchical
+ std::vector<int> neuralLM_ids; //Carry the neuralLMids of the previous target phrase to avoid calling GetWholePhrase. Hiero only.
+public:
+ BilingualLMState(size_t hash)
+ :m_hash(hash)
+ {}
+ BilingualLMState(size_t hash, std::vector<int>& word_alignments_vec, std::vector<int>& neural_ids)
+ :m_hash(hash)
+ , word_alignments(word_alignments_vec)
+ , neuralLM_ids(neural_ids)
+ {}
+
+ const std::vector<int>& GetWordAlignmentVector() const {
+ return word_alignments;
+ }
+
+ const std::vector<int>& GetWordIdsVector() const {
+ return neuralLM_ids;
+ }
+
+ int Compare(const FFState& other) const;
+};
+
+class BilingualLM : public StatefulFeatureFunction {
+ private:
+ virtual float Score(std::vector<int>& source_words, std::vector<int>& target_words) const = 0;
+
+ virtual int getNeuralLMId(const Word& word, bool is_source_word) const = 0;
+
+ virtual void loadModel() = 0;
+
+ virtual const Word& getNullWord() const = 0;
+
+ size_t selectMiddleAlignment(const std::set<size_t>& alignment_links) const;
+
+ void getSourceWords(
+ const TargetPhrase &targetPhrase,
+ int targetWordIdx,
+ const Sentence &source_sent,
+ const WordsRange &sourceWordRange,
+ std::vector<int> &words) const;
+
+ void appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const;
+
+ void getTargetWords(
+ const Hypothesis &cur_hypo,
+ const TargetPhrase &targetPhrase,
+ int current_word_index,
+ std::vector<int> &words) const;
+
+ size_t getState(const Hypothesis &cur_hypo) const;
+
+ void requestPrevTargetNgrams(const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const;
+
+ //Chart decoder
+ void getTargetWordsChart(
+ std::vector<int>& neuralLMids,
+ int current_word_index,
+ std::vector<int>& words,
+ bool sentence_begin) const;
+
+ size_t getStateChart(std::vector<int>& neuralLMids) const;
+
+ //Get a vector of all target words IDs in the beginning of calculating NeuralLMids for the current phrase.
+ void getAllTargetIdsChart(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& wordIds) const;
+ //Get a vector of all alignments (mid_idx word)
+ void getAllAlignments(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& alignemnts) const;
+
+protected:
+ // big data (vocab, weights, cache) shared among threads
+ std::string m_filePath;
+ int target_ngrams;
+ int source_ngrams;
+
+ //NeuralLM lookup
+ FactorType word_factortype;
+ FactorType pos_factortype;
+ const Factor* BOS_factor;
+ const Factor* EOS_factor;
+ mutable Word BOS_word;
+ mutable Word EOS_word;
+
+public:
+ BilingualLM(const std::string &line);
+
+ bool IsUseable(const FactorMask &mask) const {
+ return true;
+ }
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const {
+ return new BilingualLMState(0);
+ }
+
+ void Load();
+
+ void EvaluateInIsolation(
+ const Phrase &source,
+ const TargetPhrase &targetPhrase,
+ ScoreComponentCollection &scoreBreakdown,
+ ScoreComponentCollection &estimatedFutureScore) const;
+
+ void EvaluateWithSourceContext(
+ const InputType &input,
+ const InputPath &inputPath,
+ const TargetPhrase &targetPhrase,
+ const StackVec *stackVec,
+ ScoreComponentCollection &scoreBreakdown,
+ ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ FFState* EvaluateWhenApplied(
+ const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
+
+ FFState* EvaluateWhenApplied(
+ const ChartHypothesis& cur_hypo ,
+ int featureID, /* - used to index the state in the previous hypotheses */
+ ScoreComponentCollection* accumulator) const;
+
+ void SetParameter(const std::string& key, const std::string& value);
+};
+
+}
+
diff --git a/moses/LM/DALMWrapper.cpp b/moses/LM/DALMWrapper.cpp
index 01f32b104..68b3050de 100644
--- a/moses/LM/DALMWrapper.cpp
+++ b/moses/LM/DALMWrapper.cpp
@@ -16,25 +16,24 @@
using namespace std;
/////////////////////////
-void read_ini(const char *inifile, string &model, string &words, string &wordstxt)
-{
- ifstream ifs(inifile);
- string line;
-
- getline(ifs, line);
- while(ifs) {
- unsigned int pos = line.find("=");
- string key = line.substr(0, pos);
- string value = line.substr(pos+1, line.size()-pos);
- if(key=="MODEL") {
- model = value;
- } else if(key=="WORDS") {
- words = value;
- } else if(key=="WORDSTXT") {
- wordstxt = value;
- }
- getline(ifs, line);
- }
+void read_ini(const char *inifile, string &model, string &words, string &wordstxt){
+ ifstream ifs(inifile);
+ string line;
+
+ getline(ifs, line);
+ while(ifs){
+ unsigned int pos = line.find("=");
+ string key = line.substr(0, pos);
+ string value = line.substr(pos+1, line.size()-pos);
+ if(key=="MODEL"){
+ model = value;
+ }else if(key=="WORDS"){
+ words = value;
+ }else if(key=="WORDSTXT"){
+ wordstxt = value;
+ }
+ getline(ifs, line);
+ }
}
/////////////////////////
@@ -44,140 +43,140 @@ namespace Moses
class DALMState : public FFState
{
private:
- DALM::State state;
+ DALM::State state;
public:
- DALMState() {
- }
-
- DALMState(const DALMState &from) {
- state = from.state;
- }
-
- virtual ~DALMState() {
- }
-
- void reset(const DALMState &from) {
- state = from.state;
- }
-
- virtual int Compare(const FFState& other) const {
- const DALMState &o = static_cast<const DALMState &>(other);
- if(state.get_count() < o.state.get_count()) return -1;
- else if(state.get_count() > o.state.get_count()) return 1;
- else return state.compare(o.state);
- }
-
- DALM::State &get_state() {
- return state;
- }
-
- void refresh() {
- state.refresh();
- }
+ DALMState(){
+ }
+
+ DALMState(const DALMState &from){
+ state = from.state;
+ }
+
+ virtual ~DALMState(){
+ }
+
+ void reset(const DALMState &from){
+ state = from.state;
+ }
+
+ virtual int Compare(const FFState& other) const{
+ const DALMState &o = static_cast<const DALMState &>(other);
+ if(state.get_count() < o.state.get_count()) return -1;
+ else if(state.get_count() > o.state.get_count()) return 1;
+ else return state.compare(o.state);
+ }
+
+ DALM::State &get_state(){
+ return state;
+ }
+
+ void refresh(){
+ state.refresh();
+ }
};
class DALMChartState : public FFState
{
private:
- DALM::Fragment prefixFragments[DALM_MAX_ORDER-1];
- unsigned char prefixLength;
- DALM::State rightContext;
- bool isLarge;
- size_t hypoSize;
+ DALM::Fragment prefixFragments[DALM_MAX_ORDER-1];
+ unsigned char prefixLength;
+ DALM::State rightContext;
+ bool isLarge;
+ size_t hypoSize;
public:
- DALMChartState()
- : prefixLength(0),
- isLarge(false)
- {}
-
- /*
- DALMChartState(const DALMChartState &other)
- : prefixLength(other.prefixLength),
- rightContext(other.rightContext),
- isLarge(other.isLarge)
- {
- std::copy(
- other.prefixFragments,
- other.prefixFragments+other.prefixLength,
- prefixFragments
- );
- }
- */
-
- virtual ~DALMChartState() {
- }
-
- /*
- DALMChartState &operator=(const DALMChartState &other){
- prefixLength = other.prefixLength;
- std::copy(
- other.prefixFragments,
- other.prefixFragments+other.prefixLength,
- prefixFragments
- );
- rightContext = other.rightContext;
- isLarge=other.isLarge;
-
- return *this;
- }
- */
-
- inline unsigned char GetPrefixLength() const {
- return prefixLength;
- }
-
- inline unsigned char &GetPrefixLength() {
- return prefixLength;
- }
-
- inline const DALM::Fragment *GetPrefixFragments() const {
- return prefixFragments;
- }
-
- inline DALM::Fragment *GetPrefixFragments() {
- return prefixFragments;
- }
-
- inline const DALM::State &GetRightContext() const {
- return rightContext;
- }
-
- inline DALM::State &GetRightContext() {
- return rightContext;
- }
-
- inline bool LargeEnough() const {
- return isLarge;
- }
-
- inline void SetAsLarge() {
- isLarge=true;
- }
-
- inline size_t &GetHypoSize() {
- return hypoSize;
- }
- inline size_t GetHypoSize() const {
- return hypoSize;
- }
-
- virtual int Compare(const FFState& other) const {
- const DALMChartState &o = static_cast<const DALMChartState &>(other);
- if(prefixLength < o.prefixLength) return -1;
- if(prefixLength > o.prefixLength) return 1;
- if(prefixLength!=0) {
- const DALM::Fragment &f = prefixFragments[prefixLength-1];
- const DALM::Fragment &of = o.prefixFragments[prefixLength-1];
- int ret = DALM::compare_fragments(f,of);
- if(ret != 0) return ret;
- }
- if(isLarge != o.isLarge) return (int)isLarge - (int)o.isLarge;
- if(rightContext.get_count() < o.rightContext.get_count()) return -1;
- if(rightContext.get_count() > o.rightContext.get_count()) return 1;
- return rightContext.compare(o.rightContext);
- }
+ DALMChartState()
+ : prefixLength(0),
+ isLarge(false)
+ {}
+
+ /*
+ DALMChartState(const DALMChartState &other)
+ : prefixLength(other.prefixLength),
+ rightContext(other.rightContext),
+ isLarge(other.isLarge)
+ {
+ std::copy(
+ other.prefixFragments,
+ other.prefixFragments+other.prefixLength,
+ prefixFragments
+ );
+ }
+ */
+
+ virtual ~DALMChartState(){
+ }
+
+ /*
+ DALMChartState &operator=(const DALMChartState &other){
+ prefixLength = other.prefixLength;
+ std::copy(
+ other.prefixFragments,
+ other.prefixFragments+other.prefixLength,
+ prefixFragments
+ );
+ rightContext = other.rightContext;
+ isLarge=other.isLarge;
+
+ return *this;
+ }
+ */
+
+ inline unsigned char GetPrefixLength() const{
+ return prefixLength;
+ }
+
+ inline unsigned char &GetPrefixLength(){
+ return prefixLength;
+ }
+
+ inline const DALM::Fragment *GetPrefixFragments() const{
+ return prefixFragments;
+ }
+
+ inline DALM::Fragment *GetPrefixFragments(){
+ return prefixFragments;
+ }
+
+ inline const DALM::State &GetRightContext() const{
+ return rightContext;
+ }
+
+ inline DALM::State &GetRightContext() {
+ return rightContext;
+ }
+
+ inline bool LargeEnough() const{
+ return isLarge;
+ }
+
+ inline void SetAsLarge() {
+ isLarge=true;
+ }
+
+ inline size_t &GetHypoSize() {
+ return hypoSize;
+ }
+ inline size_t GetHypoSize() const {
+ return hypoSize;
+ }
+
+ virtual int Compare(const FFState& other) const{
+ const DALMChartState &o = static_cast<const DALMChartState &>(other);
+ if(prefixLength < o.prefixLength) return -1;
+ if(prefixLength > o.prefixLength) return 1;
+ if(prefixLength!=0){
+ const DALM::Fragment &f = prefixFragments[prefixLength-1];
+ const DALM::Fragment &of = o.prefixFragments[prefixLength-1];
+ int ret = DALM::compare_fragments(f,of);
+ if(ret != 0) return ret;
+ }
+ if(isLarge != o.isLarge) return (int)isLarge - (int)o.isLarge;
+ if(rightContext.get_count() < o.rightContext.get_count()) return -1;
+ if(rightContext.get_count() > o.rightContext.get_count()) return 1;
+ return rightContext.compare(o.rightContext);
+ }
};
LanguageModelDALM::LanguageModelDALM(const std::string &line)
@@ -192,58 +191,62 @@ LanguageModelDALM::LanguageModelDALM(const std::string &line)
LanguageModelDALM::~LanguageModelDALM()
{
- delete m_logger;
- delete m_vocab;
- delete m_lm;
+ delete m_logger;
+ delete m_vocab;
+ delete m_lm;
}
void LanguageModelDALM::Load()
{
- /////////////////////
- // READING INIFILE //
- /////////////////////
- string inifile= m_filePath + "/dalm.ini";
-
- UTIL_THROW_IF(model.empty() || words.empty() || wordstxt.empty(),
- util::FileOpenException,
- "Failed to read DALM ini file " << m_filePath << ". Probably doesn't exist");
-
- model = m_filePath + "/" + model;
- words = m_filePath + "/" + words;
- wordstxt = m_filePath + "/" + wordstxt;
-
- // Preparing a logger object.
- m_logger = new DALM::Logger(stderr);
- m_logger->setLevel(DALM::LOGGER_INFO);
-
- // Load the vocabulary file.
- m_vocab = new DALM::Vocabulary(words, *m_logger);
-
- // Load the language model.
- m_lm = new DALM::LM(model, *m_vocab, *m_logger);
-
- wid_start = m_vocab->lookup(BOS_);
- wid_end = m_vocab->lookup(EOS_);
-
- // Load the language model.
- m_lm = new DALM::LM(model, *m_vocab, m_nGramOrder, *m_logger);
-
- wid_start = m_vocab->lookup(BOS_);
- wid_end = m_vocab->lookup(EOS_);
-
- FactorCollection &collection = FactorCollection::Instance();
- m_beginSentenceFactor = collection.AddFactor(BOS_);
+ /////////////////////
+ // READING INIFILE //
+ /////////////////////
+ string inifile= m_filePath + "/dalm.ini";
+
+ string model; // Path to the double-array file.
+ string words; // Path to the vocabulary file.
+ string wordstxt; //Path to the vocabulary file in text format.
+ read_ini(inifile.c_str(), model, words, wordstxt);
+
+ model = m_filePath + "/" + model;
+ words = m_filePath + "/" + words;
+ wordstxt = m_filePath + "/" + wordstxt;
+
+ UTIL_THROW_IF(model.empty() || words.empty() || wordstxt.empty(),
+ util::FileOpenException,
+ "Failed to read DALM ini file " << m_filePath << ". Probably doesn't exist");
+
+ ////////////////
+ // LOADING LM //
+ ////////////////
+
+ // Preparing a logger object.
+ m_logger = new DALM::Logger(stderr);
+ m_logger->setLevel(DALM::LOGGER_INFO);
+
+ // Load the vocabulary file.
+ m_vocab = new DALM::Vocabulary(words, *m_logger);
+
+ // Load the language model.
+ m_lm = new DALM::LM(model, *m_vocab, m_nGramOrder, *m_logger);
+
+ wid_start = m_vocab->lookup(BOS_);
+ wid_end = m_vocab->lookup(EOS_);
+
+ // vocab mapping
+ CreateVocabMapping(wordstxt);
+
+ FactorCollection &collection = FactorCollection::Instance();
+ m_beginSentenceFactor = collection.AddFactor(BOS_);
}
-const FFState *LanguageModelDALM::EmptyHypothesisState(const InputType &/*input*/) const
-{
- DALMState *s = new DALMState();
- m_lm->init_state(s->get_state());
- return s;
+const FFState *LanguageModelDALM::EmptyHypothesisState(const InputType &/*input*/) const{
+ DALMState *s = new DALMState();
+ m_lm->init_state(s->get_state());
+ return s;
}
-void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
-{
+void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const{
fullScore = 0;
ngramScore = 0;
@@ -251,17 +254,18 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float
size_t phraseSize = phrase.GetSize();
if (!phraseSize) return;
+
size_t currPos = 0;
size_t hist_count = 0;
- DALM::State state;
-
- if(phrase.GetWord(0).GetFactor(m_factorType) == m_beginSentenceFactor) {
- m_lm->init_state(state);
- currPos++;
- hist_count++;
- }
-
- float score;
+ DALM::State state;
+
+ if(phrase.GetWord(0).GetFactor(m_factorType) == m_beginSentenceFactor){
+ m_lm->init_state(state);
+ currPos++;
+ hist_count++;
+ }
+
+ float score;
while (currPos < phraseSize) {
const Word &word = phrase.GetWord(currPos);
hist_count++;
@@ -270,9 +274,9 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float
state.refresh();
hist_count = 0;
} else {
- DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
- score = m_lm->query(wid, state);
- fullScore += score;
+ DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
+ score = m_lm->query(wid, state);
+ fullScore += score;
if (hist_count >= m_nGramOrder) ngramScore += score;
if (wid==m_vocab->unk()) ++oovCount;
}
@@ -280,42 +284,41 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float
currPos++;
}
- fullScore = TransformLMScore(fullScore);
- ngramScore = TransformLMScore(ngramScore);
+ fullScore = TransformLMScore(fullScore);
+ ngramScore = TransformLMScore(ngramScore);
}
-FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
-{
+FFState *LanguageModelDALM::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const{
// In this function, we only compute the LM scores of n-grams that overlap a
// phrase boundary. Phrase-internal scores are taken directly from the
// translation option.
- const DALMState *dalm_ps = static_cast<const DALMState *>(ps);
-
+ const DALMState *dalm_ps = static_cast<const DALMState *>(ps);
+
// Empty phrase added? nothing to be done
- if (hypo.GetCurrTargetLength() == 0) {
+ if (hypo.GetCurrTargetLength() == 0){
return dalm_ps ? new DALMState(*dalm_ps) : NULL;
}
-
+
const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
//[begin, end) in STL-like fashion.
const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
const std::size_t adjust_end = std::min(end, begin + m_nGramOrder - 1);
-
+
DALMState *dalm_state = new DALMState(*dalm_ps);
- DALM::State &state = dalm_state->get_state();
+ DALM::State &state = dalm_state->get_state();
float score = 0.0;
- for(std::size_t position=begin; position < adjust_end; position++) {
- score += m_lm->query(GetVocabId(hypo.GetWord(position).GetFactor(m_factorType)), state);
+ for(std::size_t position=begin; position < adjust_end; position++){
+ score += m_lm->query(GetVocabId(hypo.GetWord(position).GetFactor(m_factorType)), state);
}
-
+
if (hypo.IsSourceCompleted()) {
// Score end of sentence.
std::vector<DALM::VocabId> indices(m_nGramOrder-1);
const DALM::VocabId *last = LastIDs(hypo, &indices.front());
m_lm->set_state(&indices.front(), (last-&indices.front()), state);
-
- score += m_lm->query(wid_end, state);
+
+ score += m_lm->query(wid_end, state);
} else if (adjust_end < end) {
// Get state after adding a long phrase.
std::vector<DALM::VocabId> indices(m_nGramOrder-1);
@@ -323,7 +326,7 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
m_lm->set_state(&indices.front(), (last-&indices.front()), state);
}
- score = TransformLMScore(score);
+ score = TransformLMScore(score);
if (OOVFeatureEnabled()) {
std::vector<float> scores(2);
scores[0] = score;
@@ -332,57 +335,56 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
} else {
out->PlusEquals(this, score);
}
-
+
return dalm_state;
}
-FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const
-{
+FFState *LanguageModelDALM::EvaluateWhenApplied(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const{
// initialize language model context state
- DALMChartState *newState = new DALMChartState();
- DALM::State &state = newState->GetRightContext();
+ DALMChartState *newState = new DALMChartState();
+ DALM::State &state = newState->GetRightContext();
- DALM::Fragment *prefixFragments = newState->GetPrefixFragments();
- unsigned char &prefixLength = newState->GetPrefixLength();
- size_t &hypoSizeAll = newState->GetHypoSize();
+ DALM::Fragment *prefixFragments = newState->GetPrefixFragments();
+ unsigned char &prefixLength = newState->GetPrefixLength();
+ size_t &hypoSizeAll = newState->GetHypoSize();
// initial language model scores
float hypoScore = 0.0; // total hypothesis score.
- const TargetPhrase &targetPhrase = hypo.GetCurrTargetPhrase();
- size_t hypoSize = targetPhrase.GetSize();
- hypoSizeAll = hypoSize;
+ const TargetPhrase &targetPhrase = hypo.GetCurrTargetPhrase();
+ size_t hypoSize = targetPhrase.GetSize();
+ hypoSizeAll = hypoSize;
// get index map for underlying hypotheses
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
targetPhrase.GetAlignNonTerm().GetNonTermIndexMap();
- size_t phrasePos = 0;
-
- // begginig of sentence.
- if(hypoSize > 0) {
- const Word &word = targetPhrase.GetWord(0);
- if(word.GetFactor(m_factorType) == m_beginSentenceFactor) {
- m_lm->init_state(state);
- // state is finalized.
- newState->SetAsLarge();
- phrasePos++;
- } else if(word.IsNonTerminal()) {
+ size_t phrasePos = 0;
+
+ // begginig of sentence.
+ if(hypoSize > 0){
+ const Word &word = targetPhrase.GetWord(0);
+ if(word.GetFactor(m_factorType) == m_beginSentenceFactor){
+ m_lm->init_state(state);
+ // state is finalized.
+ newState->SetAsLarge();
+ phrasePos++;
+ }else if(word.IsNonTerminal()){
// special case: rule starts with non-terminal -> copy everything
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[0]);
const DALMChartState* prevState =
static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
- // copy chart state
- (*newState) = (*prevState);
- hypoSizeAll = hypoSize+prevState->GetHypoSize()-1;
+ // copy chart state
+ (*newState) = (*prevState);
+ hypoSizeAll = hypoSize+prevState->GetHypoSize()-1;
// get hypoScore
- hypoScore = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
+ hypoScore = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
- phrasePos++;
- }
+ phrasePos++;
+ }
}
// loop over rule
@@ -393,30 +395,30 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
// regular word
if (!word.IsNonTerminal()) {
- EvaluateTerminal(
- word, hypoScore,
- newState, state,
- prefixFragments, prefixLength
- );
+ EvaluateTerminal(
+ word, hypoScore,
+ newState, state,
+ prefixFragments, prefixLength
+ );
}
// non-terminal, add phrase from underlying hypothesis
// internal non-terminal
else {
// look up underlying hypothesis
- const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
- const DALMChartState* prevState =
- static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
- size_t prevTargetPhraseLength = prevHypo->GetCurrTargetPhrase().GetSize();
- float prevHypoScore = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
- hypoSizeAll += prevState->GetHypoSize()-1;
-
- EvaluateNonTerminal(
- word, hypoScore,
- newState, state,
- prefixFragments, prefixLength,
- prevState, prevTargetPhraseLength, prevHypoScore
- );
+ const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
+ const DALMChartState* prevState =
+ static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
+ size_t prevTargetPhraseLength = prevHypo->GetCurrTargetPhrase().GetSize();
+ float prevHypoScore = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
+ hypoSizeAll += prevState->GetHypoSize()-1;
+
+ EvaluateNonTerminal(
+ word, hypoScore,
+ newState, state,
+ prefixFragments, prefixLength,
+ prevState, prevTargetPhraseLength, prevHypoScore
+ );
}
}
@@ -435,81 +437,80 @@ void LanguageModelDALM::CreateVocabMapping(const std::string &wordstxt)
{
InputFileStream vocabStrm(wordstxt);
- std::vector< std::pair<std::size_t, DALM::VocabId> > vlist;
+ std::vector< std::pair<std::size_t, DALM::VocabId> > vlist;
string line;
- std::size_t max_fid = 0;
+ std::size_t max_fid = 0;
while(getline(vocabStrm, line)) {
- const Factor *factor = FactorCollection::Instance().AddFactor(line);
- std::size_t fid = factor->GetId();
- DALM::VocabId wid = m_vocab->lookup(line.c_str());
+ const Factor *factor = FactorCollection::Instance().AddFactor(line);
+ std::size_t fid = factor->GetId();
+ DALM::VocabId wid = m_vocab->lookup(line.c_str());
- vlist.push_back(std::pair<std::size_t, DALM::VocabId>(fid, wid));
- if(max_fid < fid) max_fid = fid;
+ vlist.push_back(std::pair<std::size_t, DALM::VocabId>(fid, wid));
+ if(max_fid < fid) max_fid = fid;
}
- for(std::size_t i = 0; i < m_vocabMap.size(); i++) {
- m_vocabMap[i] = m_vocab->unk();
- }
+ for(std::size_t i = 0; i < m_vocabMap.size(); i++){
+ m_vocabMap[i] = m_vocab->unk();
+ }
- m_vocabMap.resize(max_fid+1, m_vocab->unk());
- std::vector< std::pair<std::size_t, DALM::VocabId> >::iterator it = vlist.begin();
- while(it != vlist.end()) {
- std::pair<std::size_t, DALM::VocabId> &entry = *it;
- m_vocabMap[entry.first] = entry.second;
+ m_vocabMap.resize(max_fid+1, m_vocab->unk());
+ std::vector< std::pair<std::size_t, DALM::VocabId> >::iterator it = vlist.begin();
+ while(it != vlist.end()){
+ std::pair<std::size_t, DALM::VocabId> &entry = *it;
+ m_vocabMap[entry.first] = entry.second;
- ++it;
- }
+ ++it;
+ }
}
DALM::VocabId LanguageModelDALM::GetVocabId(const Factor *factor) const
{
- std::size_t fid = factor->GetId();
- return (m_vocabMap.size() > fid)? m_vocabMap[fid] : m_vocab->unk();
+ std::size_t fid = factor->GetId();
+ return (m_vocabMap.size() > fid)? m_vocabMap[fid] : m_vocab->unk();
}
void LanguageModelDALM::SetParameter(const std::string& key, const std::string& value)
{
if (key == "factor") {
m_factorType = Scan<FactorType>(value);
- } else if (key == "order") {
- m_nGramOrder = Scan<size_t>(value);
- } else if (key == "path") {
- m_filePath = value;
+ } else if (key == "order") {
+ m_nGramOrder = Scan<size_t>(value);
+ } else if (key == "path") {
+ m_filePath = value;
} else {
LanguageModel::SetParameter(key, value);
}
- m_ContextSize = m_nGramOrder-1;
+ m_ContextSize = m_nGramOrder-1;
}
void LanguageModelDALM::EvaluateTerminal(
- const Word &word,
- float &hypoScore,
- DALMChartState *newState,
- DALM::State &state,
- DALM::Fragment *prefixFragments,
- unsigned char &prefixLength) const
-{
-
- DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
- if (newState->LargeEnough()) {
- float score = m_lm->query(wid, state);
- hypoScore += score;
- } else {
- float score = m_lm->query(wid, state, prefixFragments[prefixLength]);
-
- if(score > 0) {
- hypoScore -= score;
- newState->SetAsLarge();
- } else if(state.get_count()<=prefixLength) {
- hypoScore += score;
- prefixLength++;
- newState->SetAsLarge();
- } else {
- hypoScore += score;
- prefixLength++;
- if(prefixLength >= m_ContextSize) newState->SetAsLarge();
- }
- }
+ const Word &word,
+ float &hypoScore,
+ DALMChartState *newState,
+ DALM::State &state,
+ DALM::Fragment *prefixFragments,
+ unsigned char &prefixLength) const{
+
+ DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
+ if (newState->LargeEnough()) {
+ float score = m_lm->query(wid, state);
+ hypoScore += score;
+ }else{
+ float score = m_lm->query(wid, state, prefixFragments[prefixLength]);
+
+ if(score > 0){
+ hypoScore -= score;
+ newState->SetAsLarge();
+ }else if(state.get_count()<=prefixLength){
+ hypoScore += score;
+ prefixLength++;
+ newState->SetAsLarge();
+ }else{
+ hypoScore += score;
+ prefixLength++;
+ if(prefixLength >= m_ContextSize) newState->SetAsLarge();
+ }
+ }
}
void LanguageModelDALM::EvaluateNonTerminal(
@@ -519,75 +520,74 @@ void LanguageModelDALM::EvaluateNonTerminal(
DALM::State &state,
DALM::Fragment *prefixFragments,
unsigned char &prefixLength,
- const DALMChartState *prevState,
- size_t prevTargetPhraseLength,
- float prevHypoScore
-) const
-{
+ const DALMChartState *prevState,
+ size_t prevTargetPhraseLength,
+ float prevHypoScore
+ ) const{
const unsigned char prevPrefixLength = prevState->GetPrefixLength();
- const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
- hypoScore += prevHypoScore;
-
- if(prevPrefixLength == 0) {
- newState->SetAsLarge();
- hypoScore += state.sum_bows(0, state.get_count());
- state = prevState->GetRightContext();
- return;
- }
- if(!state.has_context()) {
- newState->SetAsLarge();
- state = prevState->GetRightContext();
- return;
- }
- DALM::Gap gap(state);
+ const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
+ hypoScore += prevHypoScore;
+
+ if(prevPrefixLength == 0){
+ newState->SetAsLarge();
+ hypoScore += state.sum_bows(0, state.get_count());
+ state = prevState->GetRightContext();
+ return;
+ }
+ if(!state.has_context()){
+ newState->SetAsLarge();
+ state = prevState->GetRightContext();
+ return;
+ }
+ DALM::Gap gap(state);
// score its prefix
for(size_t prefixPos = 0; prefixPos < prevPrefixLength; prefixPos++) {
- const DALM::Fragment &f = prevPrefixFragments[prefixPos];
- if (newState->LargeEnough()) {
- float score = m_lm->query(f, state, gap);
- hypoScore += score;
-
- if(!gap.is_extended()) {
- state = prevState->GetRightContext();
- return;
- } else if(state.get_count() <= prefixPos+1) {
- state = prevState->GetRightContext();
- return;
- }
- } else {
- DALM::Fragment &fnew = prefixFragments[prefixLength];
- float score = m_lm->query(f, state, gap, fnew);
- hypoScore += score;
-
- if(!gap.is_extended()) {
- newState->SetAsLarge();
- state = prevState->GetRightContext();
- return;
- } else if(state.get_count() <= prefixPos+1) {
- if(!gap.is_finalized()) prefixLength++;
- newState->SetAsLarge();
- state = prevState->GetRightContext();
- return;
- } else if(gap.is_finalized()) {
- newState->SetAsLarge();
- } else {
- prefixLength++;
- if(prefixLength >= m_ContextSize) newState->SetAsLarge();
- }
- }
- gap.succ();
+ const DALM::Fragment &f = prevPrefixFragments[prefixPos];
+ if (newState->LargeEnough()) {
+ float score = m_lm->query(f, state, gap);
+ hypoScore += score;
+
+ if(!gap.is_extended()){
+ state = prevState->GetRightContext();
+ return;
+ }else if(state.get_count() <= prefixPos+1){
+ state = prevState->GetRightContext();
+ return;
+ }
+ } else {
+ DALM::Fragment &fnew = prefixFragments[prefixLength];
+ float score = m_lm->query(f, state, gap, fnew);
+ hypoScore += score;
+
+ if(!gap.is_extended()){
+ newState->SetAsLarge();
+ state = prevState->GetRightContext();
+ return;
+ }else if(state.get_count() <= prefixPos+1){
+ if(!gap.is_finalized()) prefixLength++;
+ newState->SetAsLarge();
+ state = prevState->GetRightContext();
+ return;
+ }else if(gap.is_finalized()){
+ newState->SetAsLarge();
+ }else{
+ prefixLength++;
+ if(prefixLength >= m_ContextSize) newState->SetAsLarge();
+ }
+ }
+ gap.succ();
}
// check if we are dealing with a large sub-phrase
if (prevState->LargeEnough()) {
newState->SetAsLarge();
- if(prevPrefixLength < prevState->GetHypoSize()) {
- hypoScore += state.sum_bows(prevPrefixLength, state.get_count());
- }
- // copy language model state
- state = prevState->GetRightContext();
+ if(prevPrefixLength < prevState->GetHypoSize()){
+ hypoScore += state.sum_bows(prevPrefixLength, state.get_count());
+ }
+ // copy language model state
+ state = prevState->GetRightContext();
} else {
m_lm->set_state(state, prevState->GetRightContext(), prevPrefixFragments, gap);
}
diff --git a/moses/LM/DALMWrapper.h b/moses/LM/DALMWrapper.h
index e6ba90d51..3f80adadf 100644
--- a/moses/LM/DALMWrapper.h
+++ b/moses/LM/DALMWrapper.h
@@ -34,9 +34,9 @@ public:
virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
- virtual FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
+ virtual FFState *EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
- virtual FFState *EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const;
+ virtual FFState *EvaluateWhenApplied(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const;
virtual bool IsUseable(const FactorMask &mask) const;
diff --git a/moses/LM/IRST.cpp b/moses/LM/IRST.cpp
index 2db636015..19a5f2c82 100644
--- a/moses/LM/IRST.cpp
+++ b/moses/LM/IRST.cpp
@@ -44,6 +44,7 @@ namespace Moses
LanguageModelIRST::LanguageModelIRST(const std::string &line)
:LanguageModelSingleFactor(line)
+ ,m_lmtb_dub(0)
{
const StaticData &staticData = StaticData::Instance();
int threadCount = staticData.ThreadCount();
diff --git a/moses/LM/Implementation.cpp b/moses/LM/Implementation.cpp
index b14098bc1..48c654284 100644
--- a/moses/LM/Implementation.cpp
+++ b/moses/LM/Implementation.cpp
@@ -134,7 +134,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
}
}
-FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
+FFState *LanguageModelImplementation::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
// In this function, we only compute the LM scores of n-grams that overlap a
// phrase boundary. Phrase-internal scores are taken directly from the
@@ -222,7 +222,7 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
return res;
}
-FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const
+FFState* LanguageModelImplementation::EvaluateWhenApplied(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const
{
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder());
// data structure for factored context phrase (history and predicted word)
@@ -338,7 +338,15 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
}
// assign combined score to score breakdown
- out->Assign(this, prefixScore + finalizedScore);
+ if (OOVFeatureEnabled()) {
+ vector<float> scores(2);
+ scores[0] = prefixScore + finalizedScore;
+ scores[1] = out->GetScoresForProducer(this)[1];
+ out->Assign(this, scores);
+ }
+ else {
+ out->Assign(this, prefixScore + finalizedScore);
+ }
ret->Set(prefixScore, lmState);
return ret;
diff --git a/moses/LM/Implementation.h b/moses/LM/Implementation.h
index a39f5e42b..5eb8fb209 100644
--- a/moses/LM/Implementation.h
+++ b/moses/LM/Implementation.h
@@ -89,9 +89,9 @@ public:
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
- FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
+ FFState *EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
- FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const;
+ FFState* EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const;
void updateChartScore(float *prefixScore, float *finalScore, float score, size_t wordPos) const;
diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile
index 4f964ddd8..3d68d161b 100644
--- a/moses/LM/Jamfile
+++ b/moses/LM/Jamfile
@@ -19,7 +19,7 @@ if $(with-irstlm) {
dependencies += irst ;
lmmacros += LM_IRST ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
- echo "!!! You are linking the IRSTLM library; be sure the release is >= 5.70.02 !!!" ;
+ echo "!!! You are linking with the IRSTLM library; be sure the release is >= 5.70.02 !!!" ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
}
@@ -27,6 +27,10 @@ if $(with-irstlm) {
local with-srilm = [ option.get "with-srilm" ] ;
local with-maxent-srilm = [ option.get "with-maxent-srilm" ] ;
if $(with-srilm) {
+ echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
+ echo "!!! You are linking with the SRILM library; Do NOT use version >= 1.7.1 !!!" ;
+ echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
+
if [ option.get "with-srilm-dynamic" : no : yes ] = yes {
lib srilm ;
alias sri-libs : srilm ;
@@ -83,13 +87,29 @@ if $(with-ldhtlm) {
#NPLM
local with-nplm = [ option.get "with-nplm" ] ;
if $(with-nplm) {
- lib neuralLM : : <search>$(with-nplm)/lib <search>$(with-nplm)/lib64 ;
- obj NeuralLMWrapper.o : NeuralLMWrapper.cpp neuralLM ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen-3.1.4 ;
- alias nplm : NeuralLMWrapper.o neuralLM : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
- dependencies += nplm ;
+ lib nplm : : <search>$(with-nplm)/lib <search>$(with-nplm)/lib64 ;
+ obj NeuralLMWrapper.o : NeuralLMWrapper.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
+ obj BiLM_NPLM.o : bilingual-lm/BiLM_NPLM.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
+ alias neural : NeuralLMWrapper.o BiLM_NPLM.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
+ dependencies += neural ;
lmmacros += LM_NEURAL ;
}
+#OxLM
+local with-oxlm = [ option.get "with-oxlm" ] ;
+if $(with-oxlm) {
+ lib lbl : : <search>$(with-oxlm)/lib <search>$(with-oxlm)/lib64 ;
+ lib murmurhash : : <search>$(with-oxlm)/lib <search>$(with-oxlm)/lib64 ;
+ obj OxLM.o : oxlm/OxLM.cpp lbl ..//headers : <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+ obj SourceOxLM.o : oxlm/SourceOxLM.cpp lbl ..//headers : <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+ obj OxLMMapper.o : oxlm/OxLMMapper.cpp lbl ..//headers : <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+ obj OxLMParallelMapper.o : oxlm/OxLMParallelMapper.cpp lbl ..//headers : <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
+ alias oxlm : OxLM.o SourceOxLM.o OxLMMapper.o OxLMParallelMapper.o lbl murmurhash /top//boost_filesystem : : : <cxxflags>-std=c++0x <define>LM_OXLM ;
+ dependencies += oxlm ;
+ lmmacros += LM_OXLM ;
+}
+
+
#DALM
local with-dalm = [ option.get "with-dalm" ] ;
if $(with-dalm) {
@@ -113,7 +133,7 @@ obj ORLM.o : ORLM.cpp ..//headers ../TranslationModel/DynSAInclude//dynsa : : :
#Top-level LM library. If you've added a file that doesn't depend on external
#libraries, put it here.
-alias LM : Backward.cpp BackwardLMState.cpp Base.cpp Implementation.cpp Joint.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp SkeletonLM.cpp ORLM.o
+alias LM : Backward.cpp BackwardLMState.cpp Base.cpp BilingualLM.cpp Implementation.cpp Joint.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp SkeletonLM.cpp ORLM.o
../../lm//kenlm ..//headers $(dependencies) ;
alias macros : : : : <define>$(lmmacros) ;
diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp
index 942c247ba..59fb04d16 100644
--- a/moses/LM/Ken.cpp
+++ b/moses/LM/Ken.cpp
@@ -43,6 +43,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/ChartHypothesis.h"
#include "moses/Incremental.h"
#include "moses/UserMessage.h"
+#include "moses/Syntax/SVertex.h"
using namespace std;
@@ -79,7 +80,7 @@ struct KenLMState : public FFState {
//
// FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
//
-// FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
+// FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
//
// void IncrementalCallback(Incremental::Manager &manager) const {
// manager.LMCallback(*m_ngram, m_lmIdLookup);
@@ -229,7 +230,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
fullScore = TransformLMScore(fullScore);
}
-template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
+template <class Model> FFState *LanguageModelKen<Model>::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;
@@ -307,7 +308,7 @@ private:
lm::ngram::ChartState m_state;
};
-template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const
+template <class Model> FFState *LanguageModelKen<Model>::EvaluateWhenApplied(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const
{
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
@@ -348,6 +349,59 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
float score = ruleScore.Finish();
score = TransformLMScore(score);
+ if (OOVFeatureEnabled()) {
+ std::vector<float> scores(2);
+ scores[0] = score;
+ scores[1] = 0.0;
+ accumulator->Assign(this, scores);
+ }
+ else {
+ accumulator->Assign(this, score);
+ }
+ return newState;
+}
+
+template <class Model> FFState *LanguageModelKen<Model>::EvaluateWhenApplied(const Syntax::SHyperedge& hyperedge, int featureID, ScoreComponentCollection *accumulator) const
+{
+ LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
+ const TargetPhrase &target = *hyperedge.translation;
+ const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
+ target.GetAlignNonTerm().GetNonTermIndexMap2();
+
+ const size_t size = target.GetSize();
+ size_t phrasePos = 0;
+ // Special cases for first word.
+ if (size) {
+ const Word &word = target.GetWord(0);
+ if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
+ // Begin of sentence
+ ruleScore.BeginSentence();
+ phrasePos++;
+ } else if (word.IsNonTerminal()) {
+ // Non-terminal is first so we can copy instead of rescoring.
+ const Syntax::SVertex *pred = hyperedge.tail[nonTermIndexMap[phrasePos]];
+ const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(pred->state[featureID])->GetChartState();
+ float prob = UntransformLMScore(pred->best->scoreBreakdown.GetScoresForProducer(this)[0]);
+ ruleScore.BeginNonTerminal(prevState, prob);
+ phrasePos++;
+ }
+ }
+
+ for (; phrasePos < size; phrasePos++) {
+ const Word &word = target.GetWord(phrasePos);
+ if (word.IsNonTerminal()) {
+ const Syntax::SVertex *pred = hyperedge.tail[nonTermIndexMap[phrasePos]];
+ const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(pred->state[featureID])->GetChartState();
+ float prob = UntransformLMScore(pred->best->scoreBreakdown.GetScoresForProducer(this)[0]);
+ ruleScore.NonTerminal(prevState, prob);
+ } else {
+ ruleScore.Terminal(TranslateID(word));
+ }
+ }
+
+ float score = ruleScore.Finish();
+ score = TransformLMScore(score);
accumulator->Assign(this, score);
return newState;
}
diff --git a/moses/LM/Ken.h b/moses/LM/Ken.h
index e5950f591..a2fdb6013 100644
--- a/moses/LM/Ken.h
+++ b/moses/LM/Ken.h
@@ -55,9 +55,11 @@ public:
virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
- virtual FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
+ virtual FFState *EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
- virtual FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
+ virtual FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
+
+ virtual FFState *EvaluateWhenApplied(const Syntax::SHyperedge& hyperedge, int featureID, ScoreComponentCollection *accumulator) const;
virtual void IncrementalCallback(Incremental::Manager &manager) const;
virtual void ReportHistoryOrder(std::ostream &out,const Phrase &phrase) const;
diff --git a/moses/LM/LDHT.cpp b/moses/LM/LDHT.cpp
index 61226208c..1d0331df5 100644
--- a/moses/LM/LDHT.cpp
+++ b/moses/LM/LDHT.cpp
@@ -97,7 +97,7 @@ public:
FFState* Evaluate(const Hypothesis& hypo,
const FFState* input_state,
ScoreComponentCollection* score_output) const;
- FFState* EvaluateChart(const ChartHypothesis& hypo,
+ FFState* EvaluateWhenApplied(const ChartHypothesis& hypo,
int featureID,
ScoreComponentCollection* accumulator) const;
@@ -392,7 +392,7 @@ FFState* LanguageModelLDHT::Evaluate(
return state;
}
-FFState* LanguageModelLDHT::EvaluateChart(
+FFState* LanguageModelLDHT::EvaluateWhenApplied(
const ChartHypothesis& hypo,
int featureID,
ScoreComponentCollection* accumulator) const
diff --git a/moses/LM/NeuralLMWrapper.cpp b/moses/LM/NeuralLMWrapper.cpp
index cbcde3bc3..22ff90bb9 100644
--- a/moses/LM/NeuralLMWrapper.cpp
+++ b/moses/LM/NeuralLMWrapper.cpp
@@ -1,9 +1,9 @@
#include "moses/StaticData.h"
#include "moses/FactorCollection.h"
+#include <boost/functional/hash.hpp>
#include "NeuralLMWrapper.h"
#include "neuralLM.h"
-#include <model.h>
using namespace std;
@@ -12,21 +12,19 @@ namespace Moses
NeuralLMWrapper::NeuralLMWrapper(const std::string &line)
:LanguageModelSingleFactor(line)
{
- // This space intentionally left blank
+ ReadParameters();
}
NeuralLMWrapper::~NeuralLMWrapper()
{
- delete m_neuralLM;
+ delete m_neuralLM_shared;
}
void NeuralLMWrapper::Load()
{
- TRACE_ERR("Loading NeuralLM " << m_filePath << endl);
-
// Set parameters required by ancestor classes
FactorCollection &factorCollection = FactorCollection::Instance();
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
@@ -34,58 +32,46 @@ void NeuralLMWrapper::Load()
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
m_sentenceEndWord[m_factorType] = m_sentenceEnd;
- m_neuralLM = new nplm::neuralLM();
- m_neuralLM->read(m_filePath);
- m_neuralLM->set_log_base(10);
+ m_neuralLM_shared = new nplm::neuralLM();
+ m_neuralLM_shared->read(m_filePath);
+ m_neuralLM_shared->premultiply();
+ //TODO: config option?
+ m_neuralLM_shared->set_cache(1000000);
+
+ m_unk = m_neuralLM_shared->lookup_word("<unk>");
+
+ UTIL_THROW_IF2(m_nGramOrder != m_neuralLM_shared->get_order(),
+ "Wrong order of neuralLM: LM has " << m_neuralLM_shared->get_order() << ", but Moses expects " << m_nGramOrder);
- //TODO: Implement this
}
LMResult NeuralLMWrapper::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
- unsigned int hashCode = 0;
+ if (!m_neuralLM.get()) {
+ m_neuralLM.reset(new nplm::neuralLM(*m_neuralLM_shared));
+ //TODO: config option?
+ m_neuralLM->set_cache(1000000);
+ }
+ size_t hashCode = 0;
+
vector<int> words(contextFactor.size());
-// TRACE_ERR("NeuralLM words:");
- for (size_t i=0, n=contextFactor.size(); i<n; i+=1) {
+ for (size_t i=0, n=contextFactor.size(); i<n; i++) {
const Word* word = contextFactor[i];
const Factor* factor = word->GetFactor(m_factorType);
- const std::string string= factor->GetString().as_string();
+ const std::string string = factor->GetString().as_string();
int neuralLM_wordID = m_neuralLM->lookup_word(string);
words[i] = neuralLM_wordID;
- hashCode += neuralLM_wordID;
-// TRACE_ERR(" " << string << "(" << neuralLM_wordID << ")" );
+ boost::hash_combine(hashCode, neuralLM_wordID);
}
double value = m_neuralLM->lookup_ngram(words);
-// TRACE_ERR("\t=\t" << value);
-// TRACE_ERR(endl);
// Create a new struct to hold the result
LMResult ret;
- ret.score = value;
- ret.unknown = false;
-
-
- // State* finalState is a void pointer
- //
- // Construct a hash value from the vector of words (contextFactor)
- //
- // The hash value must be the same size as sizeof(void*)
- //
- // TODO Set finalState to the above hash value
-
- // use last word as state info
-// const Factor *factor;
-// size_t hash_value(const Factor &f);
-// if (contextFactor.size()) {
-// factor = contextFactor.back()->GetFactor(m_factorType);
-// } else {
-// factor = NULL;
-// }
-//
-// (*finalState) = (State*) factor;
+ ret.score = FloorScore(value);
+ ret.unknown = (words.back() == m_unk);
(*finalState) = (State*) hashCode;
diff --git a/moses/LM/NeuralLMWrapper.h b/moses/LM/NeuralLMWrapper.h
index a8146e9bd..2b80fb303 100644
--- a/moses/LM/NeuralLMWrapper.h
+++ b/moses/LM/NeuralLMWrapper.h
@@ -2,24 +2,26 @@
#include "SingleFactor.h"
-namespace nplm
-{
-class neuralLM;
+#include <boost/thread/tss.hpp>
+
+namespace nplm {
+ class neuralLM;
}
namespace Moses
{
-/** Implementation of single factor LM using IRST's code.
- */
class NeuralLMWrapper : public LanguageModelSingleFactor
{
protected:
- nplm::neuralLM *m_neuralLM;
+ // big data (vocab, weights, cache) shared among threads
+ nplm::neuralLM *m_neuralLM_shared;
+ // thread-specific nplm for thread-safety
+ mutable boost::thread_specific_ptr<nplm::neuralLM> m_neuralLM;
+ int m_unk;
public:
NeuralLMWrapper(const std::string &line);
- // NeuralLM(const std::string &line);
~NeuralLMWrapper();
virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = 0) const;
diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp
index 0a5837874..cf080f04a 100644
--- a/moses/LM/SingleFactor.cpp
+++ b/moses/LM/SingleFactor.cpp
@@ -87,6 +87,17 @@ void LanguageModelSingleFactor::SetParameter(const std::string& key, const std::
}
}
+std::string LanguageModelSingleFactor::DebugContextFactor(const std::vector<const Word*> &contextFactor) const
+{
+ std::string ret;
+ for (size_t i = 0; i < contextFactor.size(); ++i) {
+ const Word &word = *contextFactor[i];
+ ret += word.ToString();
+ }
+
+ return ret;
+}
+
}
diff --git a/moses/LM/SingleFactor.h b/moses/LM/SingleFactor.h
index eeb5cdbef..fd1d893e6 100644
--- a/moses/LM/SingleFactor.h
+++ b/moses/LM/SingleFactor.h
@@ -67,6 +67,8 @@ public:
virtual LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const = 0;
+
+ std::string DebugContextFactor(const std::vector<const Word*> &contextFactor) const;
};
diff --git a/moses/LM/bilingual-lm/BiLM_NPLM.cpp b/moses/LM/bilingual-lm/BiLM_NPLM.cpp
new file mode 100644
index 000000000..190aade1c
--- /dev/null
+++ b/moses/LM/bilingual-lm/BiLM_NPLM.cpp
@@ -0,0 +1,138 @@
+#include "BiLM_NPLM.h"
+#include "neuralLM.h"
+#include "vocabulary.h"
+
+namespace Moses {
+
+BilingualLM_NPLM::BilingualLM_NPLM(const std::string &line)
+ : BilingualLM(line),
+ premultiply(true),
+ factored(false),
+ neuralLM_cache(1000000) {
+
+ if (!NULL_overwrite) {
+ NULL_string = "<null>"; //Default null value for nplm
+ }
+ FactorCollection& factorFactory = FactorCollection::Instance(); // To add null word.
+ const Factor* NULL_factor = factorFactory.AddFactor(NULL_string);
+ NULL_word.SetFactor(0, NULL_factor);
+ }
+
+float BilingualLM_NPLM::Score(std::vector<int>& source_words, std::vector<int>& target_words) const {
+ source_words.reserve(source_ngrams+target_ngrams+1);
+ source_words.insert( source_words.end(), target_words.begin(), target_words.end() );
+ return FloorScore(m_neuralLM->lookup_ngram(source_words));
+}
+
+const Word& BilingualLM_NPLM::getNullWord() const {
+ return NULL_word;
+}
+
+int BilingualLM_NPLM::getNeuralLMId(const Word& word, bool is_source_word) const {
+ initSharedPointer();
+
+ //Decide if we are doing source or target side first.
+ boost::unordered_map<const Factor*, int> * neuralLMids;
+ int unknown_word_id;
+ if (is_source_word) {
+ neuralLMids = &source_neuralLMids;
+ unknown_word_id = source_unknown_word_id;
+ } else {
+ neuralLMids = &target_neuralLMids;
+ unknown_word_id = target_unknown_word_id;
+ }
+
+ boost::unordered_map<const Factor*, int>::iterator it;
+ const Factor* factor = word.GetFactor(word_factortype);
+
+ it = neuralLMids->find(factor);
+ //If we know the word return immediately
+ if (it != neuralLMids->end()){
+ return it->second;
+ }
+ //If we don't know the word and we aren't factored, return the word.
+ if (!factored) {
+ return unknown_word_id;
+ }
+ //Else try to get a pos_factor
+ const Factor* pos_factor = word.GetFactor(pos_factortype);
+ it = neuralLMids->find(pos_factor);
+ if (it != neuralLMids->end()){
+ return it->second;
+ } else {
+ return unknown_word_id;
+ }
+}
+
+void BilingualLM_NPLM::initSharedPointer() const {
+ if (!m_neuralLM.get()) {
+ m_neuralLM.reset(new nplm::neuralLM(*m_neuralLM_shared));
+ }
+}
+
+void BilingualLM_NPLM::SetParameter(const std::string& key, const std::string& value) {
+ if (key == "target_ngrams") {
+ target_ngrams = Scan<int>(value);
+ } else if (key == "source_ngrams") {
+ source_ngrams = Scan<int>(value);
+ } else if (key == "factored") {
+ factored = Scan<bool>(value);
+ } else if (key == "pos_factor") {
+ pos_factortype = Scan<FactorType>(value);
+ } else if (key == "source_vocab") {
+ source_vocab_path = value;
+ } else if (key == "target_vocab") {
+ target_vocab_path = value;
+ } else if (key == "cache_size") {
+ neuralLM_cache = atoi(value.c_str());
+ } else if (key == "premultiply") {
+ premultiply = Scan<bool>(value);
+ } else if (key == "null_word") {
+ NULL_string = value;
+ NULL_overwrite = true;
+ } else {
+ BilingualLM::SetParameter(key, value);
+ }
+}
+
+void BilingualLM_NPLM::loadModel() {
+ m_neuralLM_shared = new nplm::neuralLM();
+ m_neuralLM_shared->read(m_filePath);
+ if (premultiply) {
+ m_neuralLM_shared->premultiply();
+ }
+
+ int ngram_order = target_ngrams + source_ngrams + 1;
+ UTIL_THROW_IF2(
+ ngram_order != m_neuralLM_shared->get_order(),
+ "Wrong order of neuralLM: LM has " << m_neuralLM_shared->get_order() <<
+ ", but Moses expects " << ngram_order);
+
+ m_neuralLM_shared->set_cache(neuralLM_cache); //Default 1000000
+
+ //Setup factor -> NeuralLMId cache. First target words
+ FactorCollection& factorFactory = FactorCollection::Instance(); //To do the conversion from string to vocabID
+ int wordid_counter = 0;
+ target_unknown_word_id = wordid_counter; //The first word is <unk>
+ std::string raw_word;
+ std::ifstream infile_target(target_vocab_path.c_str());
+ while (infile_target >> raw_word) {
+ const Factor * factor = factorFactory.AddFactor(raw_word);
+ target_neuralLMids.insert(std::make_pair(factor, wordid_counter));
+ wordid_counter++;
+ }
+ infile_target.close();
+ source_unknown_word_id = wordid_counter; //The first word is <unk> from the next file
+
+ //Source words now:
+ std::ifstream infile_source(source_vocab_path.c_str());
+ while (infile_source >> raw_word) {
+ const Factor * factor = factorFactory.AddFactor(raw_word);
+ source_neuralLMids.insert(std::make_pair(factor, wordid_counter));
+ wordid_counter++;
+ }
+ infile_source.close();
+
+}
+
+} // namespace Moses
diff --git a/moses/LM/bilingual-lm/BiLM_NPLM.h b/moses/LM/bilingual-lm/BiLM_NPLM.h
new file mode 100644
index 000000000..9a3167455
--- /dev/null
+++ b/moses/LM/bilingual-lm/BiLM_NPLM.h
@@ -0,0 +1,49 @@
+#include "moses/LM/BilingualLM.h"
+#include <boost/unordered_map.hpp>
+#include <utility> //make_pair
+#include <fstream> //Read vocabulary files
+
+namespace nplm {
+ class neuralLM;
+}
+
+namespace Moses {
+
+class BilingualLM_NPLM : public BilingualLM {
+ public:
+ BilingualLM_NPLM(const std::string &line);
+
+ private:
+ float Score(std::vector<int>& source_words, std::vector<int>& target_words) const;
+
+ int getNeuralLMId(const Word& word, bool is_source_word) const;
+
+ void initSharedPointer() const;
+
+ void loadModel();
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ const Word& getNullWord() const;
+
+ nplm::neuralLM *m_neuralLM_shared;
+ mutable boost::thread_specific_ptr<nplm::neuralLM> m_neuralLM;
+
+ mutable boost::unordered_map<const Factor*, int> target_neuralLMids;
+ mutable boost::unordered_map<const Factor*, int> source_neuralLMids;
+
+ //const Factor* NULL_factor_overwrite;
+ std::string NULL_string;
+ bool NULL_overwrite;
+ Word NULL_word;
+
+ std::string source_vocab_path;
+ std::string target_vocab_path;
+ bool premultiply;
+ bool factored;
+ int neuralLM_cache;
+ int source_unknown_word_id;
+ int target_unknown_word_id;
+};
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/OxLM.cpp b/moses/LM/oxlm/OxLM.cpp
new file mode 100644
index 000000000..5047a0344
--- /dev/null
+++ b/moses/LM/oxlm/OxLM.cpp
@@ -0,0 +1,209 @@
+#include "OxLM.h"
+
+#include <boost/archive/binary_iarchive.hpp>
+#include <boost/archive/binary_oarchive.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/functional/hash.hpp>
+
+#include "moses/FactorCollection.h"
+#include "moses/InputType.h"
+
+using namespace std;
+using namespace oxlm;
+
+namespace Moses
+{
+
+template<class Model>
+OxLM<Model>::OxLM(const string &line)
+ : LanguageModelSingleFactor(line), normalized(true),
+ posBackOff(false), posFactorType(1),
+ persistentCache(false) {
+ ReadParameters();
+
+ FactorCollection &factorCollection = FactorCollection::Instance();
+
+ // needed by parent language model classes. Why didn't they set these themselves?
+ m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
+ m_sentenceStartWord[m_factorType] = m_sentenceStart;
+
+ m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
+ m_sentenceEndWord[m_factorType] = m_sentenceEnd;
+
+ cacheHits = totalHits = 0;
+}
+
+
+template<class Model>
+OxLM<Model>::~OxLM() {
+ if (persistentCache) {
+ if (cache.get()) {
+ string cache_file = m_filePath + ".phrases.cache.bin";
+ savePersistentCache(cache_file);
+ }
+
+ double cache_hit_ratio = 100.0 * cacheHits / totalHits;
+ cerr << "Cache hit ratio: " << cache_hit_ratio << endl;
+ }
+}
+
+
+template<class Model>
+void OxLM<Model>::SetParameter(const string& key, const string& value) {
+ if (key == "normalized") {
+ normalized = Scan<bool>(value);
+ } else if (key == "persistent-cache") {
+ persistentCache = Scan<bool>(value);
+ } else if (key == "normalized") {
+ normalized = Scan<bool>(value);
+ } else if (key == "pos-back-off") {
+ posBackOff = Scan<bool>(value);
+ } else if (key == "pos-factor-type") {
+ posFactorType = Scan<FactorType>(value);
+ } else {
+ LanguageModelSingleFactor::SetParameter(key, value);
+ }
+}
+
+template<class Model>
+void OxLM<Model>::Load() {
+ model.load(m_filePath);
+
+ boost::shared_ptr<Vocabulary> vocab = model.getVocab();
+ mapper = boost::make_shared<OxLMMapper>(vocab, posBackOff, posFactorType);
+
+ kSTART = vocab->convert("<s>");
+ kSTOP = vocab->convert("</s>");
+ kUNKNOWN = vocab->convert("<unk>");
+
+ size_t ngram_order = model.getConfig()->ngram_order;
+ UTIL_THROW_IF2(
+ m_nGramOrder != ngram_order,
+ "Wrong order for OxLM: LM has " << ngram_order << ", but Moses expects " << m_nGramOrder);
+}
+
+template<class Model>
+double OxLM<Model>::GetScore(int word, const vector<int>& context) const {
+ if (normalized) {
+ return model.getLogProb(word, context);
+ } else {
+ return model.getUnnormalizedScore(word, context);
+ }
+}
+
+template<class Model>
+LMResult OxLM<Model>::GetValue(
+ const vector<const Word*> &contextFactor, State* finalState) const {
+ if (!cache.get()) {
+ cache.reset(new QueryCache());
+ string cache_file = m_filePath + ".phrases.cache.bin";
+ loadPersistentCache(cache_file);
+ }
+
+ vector<int> context;
+ int word;
+ mapper->convert(contextFactor, context, word);
+
+ size_t context_width = m_nGramOrder - 1;
+ if (!context.empty() && context.back() == kSTART) {
+ context.resize(context_width, kSTART);
+ } else {
+ context.resize(context_width, kUNKNOWN);
+ }
+
+ double score;
+ if (persistentCache) {
+ ++totalHits;
+ NGram query(word, context);
+ pair<double, bool> ret = cache->get(query);
+ if (ret.second) {
+ score = ret.first;
+ ++cacheHits;
+ } else {
+ score = GetScore(word, context);
+ cache->put(query, score);
+ }
+ } else {
+ score = GetScore(word, context);
+ }
+
+ LMResult ret;
+ ret.score = score;
+ ret.unknown = (word == kUNKNOWN);
+
+ // calc state from hash of last n-1 words
+ size_t seed = 0;
+ boost::hash_combine(seed, word);
+ for (size_t i = 0; i < context.size() && i < context_width - 1; ++i) {
+ int id = context[i];
+ boost::hash_combine(seed, id);
+ }
+
+ (*finalState) = (State*) seed;
+ return ret;
+}
+
+template<class Model>
+void OxLM<Model>::loadPersistentCache(const string& cache_file) const {
+ if (boost::filesystem::exists(cache_file)) {
+ ifstream f(cache_file);
+ boost::archive::binary_iarchive iar(f);
+ cerr << "Loading n-gram probability cache from " << cache_file << endl;
+ iar >> *cache;
+ cerr << "Done loading " << cache->size()
+ << " n-gram probabilities..." << endl;
+ } else {
+ cerr << "Cache file not found" << endl;
+ }
+}
+
+template<class Model>
+void OxLM<Model>::savePersistentCache(const string& cache_file) const {
+ ofstream f(cache_file);
+ boost::archive::binary_oarchive oar(f);
+ cerr << "Saving persistent cache to " << cache_file << endl;
+ oar << *cache;
+ cerr << "Done saving " << cache->size()
+ << " n-gram probabilities..." << endl;
+}
+
+template<class Model>
+void OxLM<Model>::InitializeForInput(const InputType& source) {
+ LanguageModelSingleFactor::InitializeForInput(source);
+
+ if (persistentCache) {
+ if (!cache.get()) {
+ cache.reset(new QueryCache());
+ }
+
+ int sentence_id = source.GetTranslationId();
+ string cache_file = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
+ loadPersistentCache(cache_file);
+ }
+}
+
+template<class Model>
+void OxLM<Model>::CleanUpAfterSentenceProcessing(const InputType& source) {
+ // Thread safe: the model cache is thread specific.
+ model.clearCache();
+
+ if (persistentCache) {
+ int sentence_id = source.GetTranslationId();
+ string cache_file = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
+ savePersistentCache(cache_file);
+
+ cache->clear();
+ }
+
+ LanguageModelSingleFactor::CleanUpAfterSentenceProcessing(source);
+}
+
+template class OxLM<LM>;
+template class OxLM<FactoredLM>;
+template class OxLM<FactoredMaxentLM>;
+template class OxLM<FactoredTreeLM>;
+
+}
+
+
+
diff --git a/moses/LM/oxlm/OxLM.h b/moses/LM/oxlm/OxLM.h
new file mode 100644
index 000000000..a528d0882
--- /dev/null
+++ b/moses/LM/oxlm/OxLM.h
@@ -0,0 +1,60 @@
+// $Id$
+#pragma once
+
+#include <vector>
+
+#include "moses/LM/SingleFactor.h"
+
+// lbl stuff
+#include "lbl/model.h"
+#include "lbl/query_cache.h"
+
+#include "OxLMMapper.h"
+
+namespace Moses {
+
+template<class Model>
+class OxLM : public LanguageModelSingleFactor {
+ public:
+ OxLM(const std::string &line);
+
+ ~OxLM();
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void Load();
+
+ virtual LMResult GetValue(
+ const std::vector<const Word*> &contextFactor,
+ State* finalState = 0) const;
+
+ virtual void InitializeForInput(const InputType& source);
+
+ virtual void CleanUpAfterSentenceProcessing(const InputType& source);
+
+ private:
+ double GetScore(int word, const vector<int>& context) const;
+
+ void loadPersistentCache(const string& cache_file) const;
+
+ void savePersistentCache(const string& cache_file) const;
+
+ protected:
+ Model model;
+ boost::shared_ptr<OxLMMapper> mapper;
+
+ int kSTART;
+ int kSTOP;
+ int kUNKNOWN;
+
+ bool normalized;
+
+ bool posBackOff;
+ FactorType posFactorType;
+
+ bool persistentCache;
+ mutable boost::thread_specific_ptr<oxlm::QueryCache> cache;
+ mutable int cacheHits, totalHits;
+};
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/OxLMMapper.cpp b/moses/LM/oxlm/OxLMMapper.cpp
new file mode 100644
index 000000000..f2953b4e9
--- /dev/null
+++ b/moses/LM/oxlm/OxLMMapper.cpp
@@ -0,0 +1,47 @@
+#include "moses/LM/oxlm/OxLMMapper.h"
+
+#include "moses/FactorCollection.h"
+
+using namespace std;
+
+namespace Moses {
+
+OxLMMapper::OxLMMapper(
+ const boost::shared_ptr<oxlm::Vocabulary>& vocab,
+ bool pos_back_off,
+ const FactorType& pos_factor_type)
+ : posBackOff(pos_back_off), posFactorType(pos_factor_type) {
+ for (int i = 0; i < vocab->size(); ++i) {
+ const string &str = vocab->convert(i);
+ FactorCollection &fc = FactorCollection::Instance();
+ const Moses::Factor *factor = fc.AddFactor(str, false);
+ moses2Oxlm[factor] = i;
+ }
+
+ kUNKNOWN = vocab->convert("<unk>");
+}
+
+int OxLMMapper::convert(const Word& word) const {
+ const Moses::Factor* word_factor = word.GetFactor(0);
+ Coll::const_iterator iter = moses2Oxlm.find(word_factor);
+ if (posBackOff && iter == moses2Oxlm.end()) {
+ const Moses::Factor* pos_factor = word.GetFactor(posFactorType);
+ iter = moses2Oxlm.find(pos_factor);
+ }
+
+ return iter == moses2Oxlm.end() ? kUNKNOWN : iter->second;
+}
+
+void OxLMMapper::convert(
+ const vector<const Word*>& contextFactor,
+ vector<int> &ids, int &word) const {
+ ids.clear();
+ for (size_t i = 0; i < contextFactor.size() - 1; ++i) {
+ ids.push_back(convert(*contextFactor[i]));
+ }
+ std::reverse(ids.begin(), ids.end());
+
+ word = convert(*contextFactor.back());
+}
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/OxLMMapper.h b/moses/LM/oxlm/OxLMMapper.h
new file mode 100644
index 000000000..1aef7af88
--- /dev/null
+++ b/moses/LM/oxlm/OxLMMapper.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <map>
+
+#include "lbl/vocabulary.h"
+
+#include "moses/Factor.h"
+#include "moses/Phrase.h"
+
+namespace Moses {
+
+class OxLMMapper {
+ public:
+ OxLMMapper(
+ const boost::shared_ptr<oxlm::Vocabulary>& vocab,
+ bool pos_back_off,
+ const FactorType& pos_factor_type);
+
+ int convert(const Word& word) const;
+
+ void convert(
+ const std::vector<const Word*> &contextFactor,
+ std::vector<int> &ids,
+ int &word) const;
+
+ protected:
+ bool posBackOff;
+ FactorType posFactorType;
+
+ typedef std::map<const Moses::Factor*, int> Coll;
+ Coll moses2Oxlm;
+ int kUNKNOWN;
+};
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/OxLMParallelMapper.cpp b/moses/LM/oxlm/OxLMParallelMapper.cpp
new file mode 100644
index 000000000..3bfd4be04
--- /dev/null
+++ b/moses/LM/oxlm/OxLMParallelMapper.cpp
@@ -0,0 +1,40 @@
+#include "moses/LM/oxlm/OxLMParallelMapper.h"
+
+#include "lbl/parallel_vocabulary.h"
+
+#include "moses/FactorCollection.h"
+
+using namespace std;
+
+namespace Moses {
+
+OxLMParallelMapper::OxLMParallelMapper(
+ const boost::shared_ptr<oxlm::Vocabulary>& vocab,
+ bool pos_back_off,
+ const FactorType& pos_factor_type)
+ : OxLMMapper(vocab, pos_back_off, pos_factor_type) {
+ boost::shared_ptr<oxlm::ParallelVocabulary> parallel_vocab =
+ dynamic_pointer_cast<oxlm::ParallelVocabulary>(vocab);
+ assert(parallel_vocab != nullptr);
+
+ for (int i = 0; i < parallel_vocab->sourceSize(); ++i) {
+ string word = parallel_vocab->convertSource(i);
+ FactorCollection& fc = FactorCollection::Instance();
+ const Moses::Factor* factor = fc.AddFactor(word, false);
+ moses2SourceOxlm[factor] = i;
+ }
+
+ kSOURCE_UNKNOWN = parallel_vocab->convertSource("<unk>");
+}
+
+int OxLMParallelMapper::convertSource(const Word& word) const {
+ const Moses::Factor* word_factor = word.GetFactor(0);
+ Coll::const_iterator iter = moses2SourceOxlm.find(word_factor);
+ if (posBackOff && iter == moses2SourceOxlm.end()) {
+ const Moses::Factor* pos_factor = word.GetFactor(posFactorType);
+ iter = moses2SourceOxlm.find(pos_factor);
+ }
+ return iter == moses2SourceOxlm.end() ? kSOURCE_UNKNOWN : iter->second;
+}
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/OxLMParallelMapper.h b/moses/LM/oxlm/OxLMParallelMapper.h
new file mode 100644
index 000000000..9fbcfa2a3
--- /dev/null
+++ b/moses/LM/oxlm/OxLMParallelMapper.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "moses/LM/oxlm/OxLMMapper.h"
+
+namespace Moses {
+
+class OxLMParallelMapper : public OxLMMapper {
+ public:
+ OxLMParallelMapper(
+ const boost::shared_ptr<oxlm::Vocabulary>& vocab,
+ bool pos_back_off,
+ const FactorType& pos_factor_type);
+
+ int convertSource(const Word& word) const;
+
+ private:
+ Coll moses2SourceOxlm;
+ int kSOURCE_UNKNOWN;
+};
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/SourceOxLM.cpp b/moses/LM/oxlm/SourceOxLM.cpp
new file mode 100644
index 000000000..4a6991eb2
--- /dev/null
+++ b/moses/LM/oxlm/SourceOxLM.cpp
@@ -0,0 +1,137 @@
+#include "moses/LM/oxlm/SourceOxLM.h"
+
+#include <boost/archive/binary_iarchive.hpp>
+#include <boost/archive/binary_oarchive.hpp>
+#include <boost/filesystem.hpp>
+
+using namespace std;
+using namespace oxlm;
+
+namespace Moses {
+
+SourceOxLM::SourceOxLM(const string &line)
+ : BilingualLM(line), posBackOff(false), posFactorType(1),
+ persistentCache(false), cacheHits(0), totalHits(0) {
+ FactorCollection& factorFactory = FactorCollection::Instance(); // To add null word.
+ const Factor* NULL_factor = factorFactory.AddFactor("<unk>");
+ NULL_word.SetFactor(0, NULL_factor);
+ }
+
+SourceOxLM::~SourceOxLM() {
+ if (persistentCache) {
+ double cache_hit_ratio = 100.0 * cacheHits / totalHits;
+ cerr << "Cache hit ratio: " << cache_hit_ratio << endl;
+ }
+}
+
+float SourceOxLM::Score(
+ vector<int>& source_words,
+ vector<int>& target_words) const {
+ // OxLM expects the context in the following format:
+ // [t_{n-1}, t_{n-2}, ..., t_{n-m}, s_{a_n-sm}, s_{a_n-sm+1}, ..., s_{a_n+sm}]
+ // where n is the index for the current target word, m is the target order,
+ // a_n is t_n's affiliation and sm is the source order.
+ vector<int> context = target_words;
+ int word = context.back();
+ context.pop_back();
+ reverse(context.begin(), context.end());
+ context.insert(context.end(), source_words.begin(), source_words.end());
+
+ float score;
+ if (persistentCache) {
+ if (!cache.get()) {
+ cache.reset(new QueryCache());
+ }
+
+ ++totalHits;
+ NGram query(word, context);
+ pair<double, bool> ret = cache->get(query);
+ if (ret.second) {
+ score = ret.first;
+ ++cacheHits;
+ } else {
+ score = model.getLogProb(word, context);
+ cache->put(query, score);
+ }
+ } else {
+ score = model.getLogProb(word, context);
+ }
+
+ // TODO(pauldb): Return OOV count too.
+ return score;
+}
+
+int SourceOxLM::getNeuralLMId(const Word& word, bool is_source_word) const {
+ return is_source_word ? mapper->convertSource(word) : mapper->convert(word);
+}
+
+const Word& SourceOxLM::getNullWord() const {
+ return NULL_word;
+}
+
+void SourceOxLM::loadModel() {
+ model.load(m_filePath);
+
+ boost::shared_ptr<ModelData> config = model.getConfig();
+ source_ngrams = 2 * config->source_order - 1;
+ target_ngrams = config->ngram_order - 1;
+
+ boost::shared_ptr<Vocabulary> vocab = model.getVocab();
+ mapper = boost::make_shared<OxLMParallelMapper>(
+ vocab, posBackOff, posFactorType);
+}
+
+void SourceOxLM::SetParameter(const string& key, const string& value) {
+ if (key == "persistent-cache") {
+ persistentCache = Scan<bool>(value);
+ } else if (key == "pos-back-off") {
+ posBackOff = Scan<bool>(value);
+ } else if (key == "pos-factor-type") {
+ posFactorType = Scan<FactorType>(value);
+ } else {
+ BilingualLM::SetParameter(key, value);
+ }
+}
+
+void SourceOxLM::InitializeForInput(const InputType& source) {
+ BilingualLM::InitializeForInput(source);
+
+ if (persistentCache) {
+ if (!cache.get()) {
+ cache.reset(new QueryCache());
+ }
+
+ int sentence_id = source.GetTranslationId();
+ string cacheFile = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
+ if (boost::filesystem::exists(cacheFile)) {
+ ifstream fin(cacheFile);
+ boost::archive::binary_iarchive iar(fin);
+ cerr << "Loading n-gram probability cache from " << cacheFile << endl;
+ iar >> *cache;
+ cerr << "Done loading " << cache->size()
+ << " n-gram probabilities..." << endl;
+ } else {
+ cerr << "Cache file not found!" << endl;
+ }
+ }
+}
+
+void SourceOxLM::CleanUpAfterSentenceProcessing(const InputType& source) {
+ // Thread safe: the model cache is thread specific.
+ model.clearCache();
+
+ if (persistentCache) {
+ int sentence_id = source.GetTranslationId();
+ string cacheFile = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
+ ofstream fout(cacheFile);
+ boost::archive::binary_oarchive oar(fout);
+ cerr << "Saving persistent cache to " << cacheFile << endl;
+ oar << *cache;
+ cerr << "Done saving " << cache->size()
+ << " n-gram probabilities..." << endl;
+
+ cache->clear();
+ }
+}
+
+} // namespace Moses
diff --git a/moses/LM/oxlm/SourceOxLM.h b/moses/LM/oxlm/SourceOxLM.h
new file mode 100644
index 000000000..3af48489f
--- /dev/null
+++ b/moses/LM/oxlm/SourceOxLM.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <vector>
+
+#include "lbl/model.h"
+#include "lbl/query_cache.h"
+
+#include "moses/LM/BilingualLM.h"
+#include "moses/LM/oxlm/OxLMParallelMapper.h"
+
+namespace Moses {
+
+class SourceOxLM : public BilingualLM {
+ public:
+ SourceOxLM(const std::string &line);
+
+ ~SourceOxLM();
+
+ private:
+ virtual float Score(
+ std::vector<int>& source_words,
+ std::vector<int>& target_words) const;
+
+ virtual int getNeuralLMId(const Word& word, bool is_source_word) const;
+
+ virtual void loadModel();
+
+ const Word& getNullWord() const;
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void InitializeForInput(const InputType& source);
+
+ void CleanUpAfterSentenceProcessing(const InputType& source);
+
+ protected:
+ oxlm::SourceFactoredLM model;
+ boost::shared_ptr<OxLMParallelMapper> mapper;
+
+ bool posBackOff;
+ FactorType posFactorType;
+
+ bool persistentCache;
+ mutable boost::thread_specific_ptr<oxlm::QueryCache> cache;
+ mutable int cacheHits, totalHits;
+ Word NULL_word; //Null symbol for hiero
+};
+
+} // namespace Moses
diff --git a/moses-cmd/LatticeMBR.cpp b/moses/LatticeMBR.cpp
index 148b44743..9ea21d5db 100644
--- a/moses-cmd/LatticeMBR.cpp
+++ b/moses/LatticeMBR.cpp
@@ -13,9 +13,8 @@
#include <set>
using namespace std;
-using namespace Moses;
-namespace MosesCmd
+namespace Moses
{
size_t bleu_order = 4;
diff --git a/moses-cmd/LatticeMBR.h b/moses/LatticeMBR.h
index ab8b3cb76..47d6da3c4 100644
--- a/moses-cmd/LatticeMBR.h
+++ b/moses/LatticeMBR.h
@@ -19,7 +19,7 @@
-namespace MosesCmd
+namespace Moses
{
class Edge;
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index ba6d1d362..f061ea640 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -39,9 +39,12 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "TranslationOption.h"
#include "TranslationOptionCollection.h"
#include "Timer.h"
+#include "moses/OutputCollector.h"
#include "moses/FF/DistortionScoreProducer.h"
#include "moses/LM/Base.h"
#include "moses/TranslationModel/PhraseDictionary.h"
+#include "moses/TranslationAnalysis.h"
+#include "moses/HypergraphOutput.h"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
@@ -54,13 +57,12 @@ using namespace std;
namespace Moses
{
-Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm)
- :m_transOptColl(source.CreateTranslationOptionCollection())
+Manager::Manager(InputType const& source, SearchAlgorithm searchAlgorithm)
+ :BaseManager(source)
+ ,m_transOptColl(source.CreateTranslationOptionCollection())
,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
,interrupted_flag(0)
,m_hypoId(0)
- ,m_lineNumber(lineNumber)
- ,m_source(source)
{
StaticData::Instance().InitializeForInput(m_source);
}
@@ -78,7 +80,7 @@ Manager::~Manager()
* Main decoder loop that translates a sentence by expanding
* hypotheses stack by stack, until the end of the sentence.
*/
-void Manager::ProcessSentence()
+void Manager::Decode()
{
// initialize statistics
ResetSentenceStats(m_source);
@@ -105,15 +107,17 @@ void Manager::ProcessSentence()
// some reporting on how long this took
IFVERBOSE(1) {
GetSentenceStats().StopTimeCollectOpts();
- TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took " << GetSentenceStats().GetTimeCollectOpts() << " seconds" << endl);
+ TRACE_ERR("Line "<< m_source.GetTranslationId() << ": Collecting options took "
+ << GetSentenceStats().GetTimeCollectOpts() << " seconds at "
+ << __FILE__ << ":" << __LINE__ << endl);
}
// search for best translation with the specified algorithm
Timer searchTime;
searchTime.start();
m_search->ProcessSentence();
- VERBOSE(1, "Line " << m_lineNumber << ": Search took " << searchTime << " seconds" << endl);
- IFVERBOSE(2) {
+ VERBOSE(1, "Line " << m_source.GetTranslationId() << ": Search took " << searchTime << " seconds" << endl);
+ IFVERBOSE(2) {
GetSentenceStats().StopTimeTotal();
TRACE_ERR(GetSentenceStats());
}
@@ -182,11 +186,11 @@ void Manager::printDivergentHypothesis(long translationId, const Hypothesis* hyp
}
-void
+void
Manager::
-printThisHypothesis(long translationId, const Hypothesis* hypo,
- const vector <const TargetPhrase*> & remainingPhrases,
- float remainingScore, ostream& outputStream) const
+printThisHypothesis(long translationId, const Hypothesis* hypo,
+ const vector <const TargetPhrase*> & remainingPhrases,
+ float remainingScore, ostream& outputStream) const
{
outputStream << translationId << " ||| ";
@@ -325,12 +329,12 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
if (i->forward >= 0) {
map<int,const Hypothesis*>::const_iterator idToHypIter = idToHyp.find(i->forward);
UTIL_THROW_IF2(idToHypIter == idToHyp.end(),
- "Couldn't find hypothesis " << i->forward);
+ "Couldn't find hypothesis " << i->forward);
const Hypothesis* nextHypo = idToHypIter->second;
outgoingHyps[hypo].insert(nextHypo);
map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
UTIL_THROW_IF2(fscoreIter == fscores.end(),
- "Couldn't find scores for hypothsis " << nextHypo->GetId());
+ "Couldn't find scores for hypothsis " << nextHypo->GetId());
edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
i->fscore - fscoreIter->second;
}
@@ -348,17 +352,17 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
outgoingHyps.find(i->hypo);
UTIL_THROW_IF2(outIter == outgoingHyps.end(),
- "Couldn't find hypothesis " << i->hypo->GetId());
+ "Couldn't find hypothesis " << i->hypo->GetId());
float sigma = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
j != outIter->second.end(); ++j) {
map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
UTIL_THROW_IF2(succIter == sigmas.end(),
- "Couldn't find hypothesis " << (*j)->GetId());
+ "Couldn't find hypothesis " << (*j)->GetId());
map<Edge,float>::const_iterator edgeScoreIter =
edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
UTIL_THROW_IF2(edgeScoreIter == edgeScores.end(),
- "Couldn't find edge for hypothesis " << (*j)->GetId());
+ "Couldn't find edge for hypothesis " << (*j)->GetId());
float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
if (sigma == 0) {
sigma = term;
@@ -391,10 +395,10 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
j != outIter->second.end(); ++j) {
candidates.push_back(*j);
UTIL_THROW_IF2(sigmas.find(*j) == sigmas.end(),
- "Hypothesis " << (*j)->GetId() << " not found");
+ "Hypothesis " << (*j)->GetId() << " not found");
Edge edge(path.back()->GetId(),(*j)->GetId());
UTIL_THROW_IF2(edgeScores.find(edge) == edgeScores.end(),
- "Edge not found");
+ "Edge not found");
candidateScores.push_back(sigmas[*j] + edgeScores[edge]);
if (scoreTotal == 0) {
scoreTotal = candidateScores.back();
@@ -469,7 +473,7 @@ void Manager::CalcDecoderStatistics() const
}
}
-void OutputWordGraph(std::ostream &outputWordGraphStream, const Hypothesis *hypo, size_t &linkId)
+void Manager::OutputWordGraph(std::ostream &outputWordGraphStream, const Hypothesis *hypo, size_t &linkId) const
{
const Hypothesis *prevHypo = hypo->GetPrevHypo();
@@ -549,14 +553,13 @@ void OutputWordGraph(std::ostream &outputWordGraphStream, const Hypothesis *hypo
outputWordGraphStream << endl;
}
-void Manager::GetOutputLanguageModelOrder( std::ostream &out, const Hypothesis *hypo )
-{
+void Manager::GetOutputLanguageModelOrder( std::ostream &out, const Hypothesis *hypo ) {
Phrase translation;
hypo->GetOutputPhrase(translation);
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (size_t i = 0; i < statefulFFs.size(); ++i) {
const StatefulFeatureFunction *ff = statefulFFs[i];
- if (const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff)) {
+ if (const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff)) {
lm->ReportHistoryOrder(out, translation);
}
}
@@ -565,8 +568,19 @@ void Manager::GetOutputLanguageModelOrder( std::ostream &out, const Hypothesis *
void Manager::GetWordGraph(long translationId, std::ostream &outputWordGraphStream) const
{
const StaticData &staticData = StaticData::Instance();
- string fileName = staticData.GetParam("output-word-graph")[0];
- bool outputNBest = Scan<bool>(staticData.GetParam("output-word-graph")[1]);
+ const PARAM_VEC *params;
+
+ string fileName;
+ bool outputNBest = false;
+ params = staticData.GetParameter().GetParam("output-word-graph");
+ if (params && params->size()) {
+ fileName = params->at(0);
+
+ if (params->size() == 2) {
+ outputNBest = Scan<bool>(params->at(1));
+ }
+ }
+
const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
outputWordGraphStream << "VERSION=1.0" << endl
@@ -756,18 +770,12 @@ void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostr
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
-
- const vector<const StatelessFeatureFunction*>& slf =StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- size_t featureIndex = 1;
- for (size_t i = 0; i < sff.size(); ++i) {
- featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream);
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- {
- featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream);
- }
+ ScoreComponentCollection scores = hypo->GetScoreBreakdown();
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
+ if (prevHypo) {
+ scores.MinusEquals(prevHypo->GetScoreBreakdown());
}
+ scores.Save(outputSearchGraphStream, false);
}
@@ -832,35 +840,11 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
// }
}
-size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
-{
- if (!ff->IsTuneable()) {
- return index;
- }
- ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
- const Hypothesis *prevHypo = hypo->GetPrevHypo();
- if (prevHypo) {
- scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
- }
- vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
- size_t numScoreComps = featureValues.size();
-
- if (numScoreComps > 1) {
- for (size_t i = 0; i < numScoreComps; ++i) {
- outputSearchGraphStream << ff->GetScoreProducerDescription() << i << "=" << featureValues[i] << " ";
- }
- } else {
- outputSearchGraphStream << ff->GetScoreProducerDescription() << "=" << featureValues[0] << " ";
- }
-
- return index+numScoreComps;
-}
-
/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
-void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
+void Manager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const
{
- VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << translationId << std::endl)
+ VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
@@ -871,7 +855,7 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
set<int> terminalNodes;
multimap<int,int> hypergraphIDToArcs;
- VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << translationId << std::endl)
+ VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
long numNodes = 0;
long endNode = 0;
@@ -933,15 +917,15 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
// Print number of nodes and arcs
outputSearchGraphStream << numNodes << " " << numArcs << endl;
- VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId
+ VERBOSE(2,"Search graph to output as hypergraph for sentence " << m_source.GetTranslationId()
<< " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
- VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl)
+ VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
if (hypergraphHypothesisID % 100000 == 0) {
- VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << translationId << std::endl);
+ VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << m_source.GetTranslationId() << std::endl);
}
// int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
@@ -964,7 +948,7 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
UTIL_THROW_IF2(
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
- "Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
+ "Error while writing search lattice as hypergraph for sentence " << m_source.GetTranslationId() << ". " <<
"Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
". There are " << numNodes << " nodes in the search lattice."
@@ -979,7 +963,7 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
// VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
UTIL_THROW_IF2(
(startNode >= hypergraphHypothesisID),
- "Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
+ "Error while writing search lattice as hypergraph for sentence" << m_source.GetTranslationId() << ". " <<
"The nodes must be output in topological order. The code attempted to violate this restriction."
);
@@ -1344,7 +1328,7 @@ void Manager::SerializeSearchGraphPB(
for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
const Hypothesis *loserHypo = *iterArcList;
UTIL_THROW_IF2(!connected[loserHypo->GetId()],
- "Hypothesis " << loserHypo->GetId() << " is not connected");
+ "Hypothesis " << loserHypo->GetId() << " is not connected");
Hypergraph_Edge* edge = hg.add_edges();
SerializeEdgeInfo(loserHypo, edge);
edge->set_head_node(headNodeIdx);
@@ -1466,4 +1450,374 @@ SentenceStats& Manager::GetSentenceStats() const
}
+void Manager::OutputNBest(OutputCollector *collector) const
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ if (collector && !staticData.UseLatticeMBR()) {
+ TrellisPathList nBestList;
+ ostringstream out;
+ CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
+ OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_source.GetTranslationId(),
+ staticData.GetReportSegmentation());
+ collector->Write(m_source.GetTranslationId(), out.str());
+ }
+
+}
+
+void Manager::OutputNBest(std::ostream& out
+ , const Moses::TrellisPathList &nBestList
+ , const std::vector<Moses::FactorType>& outputFactorOrder
+ , long translationId
+ , char reportSegmentation) const
+{
+ const StaticData &staticData = StaticData::Instance();
+ bool reportAllFactors = staticData.GetReportAllFactorsNBest();
+ bool includeSegmentation = staticData.NBestIncludesSegmentation();
+ bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
+
+ TrellisPathList::const_iterator iter;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+ const TrellisPath &path = **iter;
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
+
+ // print the surface factor of the translation
+ out << translationId << " ||| ";
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ OutputSurface(out, edge, outputFactorOrder, reportSegmentation, reportAllFactors);
+ }
+ out << " |||";
+
+ // print scores with feature names
+ OutputAllFeatureScores(path.GetScoreBreakdown(), out );
+
+ // total
+ out << " ||| " << path.GetTotalScore();
+
+ //phrase-to-phrase segmentation
+ if (includeSegmentation) {
+ out << " |||";
+ for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
+ WordsRange targetRange = path.GetTargetWordsRange(edge);
+ out << " " << sourceRange.GetStartPos();
+ if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
+ out << "-" << sourceRange.GetEndPos();
+ }
+ out<< "=" << targetRange.GetStartPos();
+ if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
+ out<< "-" << targetRange.GetEndPos();
+ }
+ }
+ }
+
+ if (includeWordAlignment) {
+ out << " ||| ";
+ for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
+ WordsRange targetRange = path.GetTargetWordsRange(edge);
+ const int sourceOffset = sourceRange.GetStartPos();
+ const int targetOffset = targetRange.GetStartPos();
+ const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
+
+ OutputAlignment(out, ai, sourceOffset, targetOffset);
+
+ }
+ }
+
+ if (StaticData::Instance().IsPathRecoveryEnabled()) {
+ out << " ||| ";
+ OutputInput(out, edges[0]);
+ }
+
+ out << endl;
+ }
+
+ out << std::flush;
}
+
+//////////////////////////////////////////////////////////////////////////
+/***
+ * print surface factor only for the given phrase
+ */
+void Manager::OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
+ char reportSegmentation, bool reportAllFactors) const
+{
+ UTIL_THROW_IF2(outputFactorOrder.size() == 0,
+ "Must specific at least 1 output factor");
+ const TargetPhrase& phrase = edge.GetCurrTargetPhrase();
+ bool markUnknown = StaticData::Instance().GetMarkUnknown();
+ if (reportAllFactors == true) {
+ out << phrase;
+ } else {
+ FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
+
+ std::map<size_t, const Factor*> placeholders;
+ if (placeholderFactor != NOT_FOUND) {
+ // creates map of target position -> factor for placeholders
+ placeholders = GetPlaceholders(edge, placeholderFactor);
+ }
+
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+
+ if (placeholders.size()) {
+ // do placeholders
+ std::map<size_t, const Factor*>::const_iterator iter = placeholders.find(pos);
+ if (iter != placeholders.end()) {
+ factor = iter->second;
+ }
+ }
+
+ UTIL_THROW_IF2(factor == NULL,
+ "No factor 0 at position " << pos);
+
+ //preface surface form with UNK if marking unknowns
+ const Word &word = phrase.GetWord(pos);
+ if(markUnknown && word.IsOOV()) {
+ out << "UNK" << *factor;
+ } else {
+ out << *factor;
+ }
+
+ for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
+ UTIL_THROW_IF2(factor == NULL,
+ "No factor " << i << " at position " << pos);
+
+ out << "|" << *factor;
+ }
+ out << " ";
+ }
+ }
+
+ // trace ("report segmentation") option "-t" / "-tt"
+ if (reportSegmentation > 0 && phrase.GetSize() > 0) {
+ const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
+ const int sourceStart = sourceRange.GetStartPos();
+ const int sourceEnd = sourceRange.GetEndPos();
+ out << "|" << sourceStart << "-" << sourceEnd; // enriched "-tt"
+ if (reportSegmentation == 2) {
+ out << ",wa=";
+ const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
+ OutputAlignment(out, ai, 0, 0);
+ out << ",total=";
+ out << edge.GetScore() - edge.GetPrevHypo()->GetScore();
+ out << ",";
+ ScoreComponentCollection scoreBreakdown(edge.GetScoreBreakdown());
+ scoreBreakdown.MinusEquals(edge.GetPrevHypo()->GetScoreBreakdown());
+ OutputAllFeatureScores(scoreBreakdown, out);
+ }
+ out << "| ";
+ }
+}
+
+void Manager::OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset) const
+{
+ typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
+ AlignVec alignments = ai.GetSortedAlignments();
+
+ AlignVec::const_iterator it;
+ for (it = alignments.begin(); it != alignments.end(); ++it) {
+ const std::pair<size_t,size_t> &alignment = **it;
+ out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
+ }
+
+}
+
+void Manager::OutputInput(std::ostream& os, const Hypothesis* hypo) const
+{
+ size_t len = hypo->GetInput().GetSize();
+ std::vector<const Phrase*> inp_phrases(len, 0);
+ OutputInput(inp_phrases, hypo);
+ for (size_t i=0; i<len; ++i)
+ if (inp_phrases[i]) os << *inp_phrases[i];
+}
+
+void Manager::OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo) const
+{
+ if (hypo->GetPrevHypo()) {
+ OutputInput(map, hypo->GetPrevHypo());
+ map[hypo->GetCurrSourceWordsRange().GetStartPos()] = &hypo->GetTranslationOption().GetInputPath().GetPhrase();
+ }
+}
+
+std::map<size_t, const Factor*> Manager::GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor) const
+{
+ const InputPath &inputPath = hypo.GetTranslationOption().GetInputPath();
+ const Phrase &inputPhrase = inputPath.GetPhrase();
+
+ std::map<size_t, const Factor*> ret;
+
+ for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) {
+ const Factor *factor = inputPhrase.GetFactor(sourcePos, placeholderFactor);
+ if (factor) {
+ std::set<size_t> targetPos = hypo.GetTranslationOption().GetTargetPhrase().GetAlignTerm().GetAlignmentsForSource(sourcePos);
+ UTIL_THROW_IF2(targetPos.size() != 1,
+ "Placeholder should be aligned to 1, and only 1, word");
+ ret[*targetPos.begin()] = factor;
+ }
+ }
+
+ return ret;
+}
+
+void Manager::OutputLatticeSamples(OutputCollector *collector) const
+{
+ const StaticData &staticData = StaticData::Instance();
+ if (collector) {
+ TrellisPathList latticeSamples;
+ ostringstream out;
+ CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples);
+ OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), m_source.GetTranslationId(),
+ staticData.GetReportSegmentation());
+ collector->Write(m_source.GetTranslationId(), out.str());
+ }
+
+}
+
+void Manager::OutputAlignment(OutputCollector *collector) const
+{
+ if (collector) {
+ std::vector<const Hypothesis *> edges;
+ const Hypothesis *currentHypo = GetBestHypothesis();
+ while (currentHypo) {
+ edges.push_back(currentHypo);
+ currentHypo = currentHypo->GetPrevHypo();
+ }
+
+ OutputAlignment(collector,m_source.GetTranslationId(), edges);
+ }
+}
+
+void Manager::OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges) const
+{
+ ostringstream out;
+ OutputAlignment(out, edges);
+
+ collector->Write(lineNo,out.str());
+}
+
+void Manager::OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges) const
+{
+ size_t targetOffset = 0;
+
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
+ const Hypothesis &edge = *edges[currEdge];
+ const TargetPhrase &tp = edge.GetCurrTargetPhrase();
+ size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
+
+ OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
+
+ targetOffset += tp.GetSize();
+ }
+ // Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
+ // Or fix it somewhere else.
+ out << std::endl;
+}
+
+void Manager::OutputDetailedTranslationReport(OutputCollector *collector) const
+{
+ if (collector) {
+ ostringstream out;
+ FixPrecision(out,PRECISION);
+ TranslationAnalysis::PrintTranslationAnalysis(out, GetBestHypothesis());
+ collector->Write(m_source.GetTranslationId(),out.str());
+ }
+
+}
+
+void Manager::OutputUnknowns(OutputCollector *collector) const
+{
+ if (collector) {
+ long translationId = m_source.GetTranslationId();
+ const vector<const Phrase*>& unknowns = m_transOptColl->GetUnknownSources();
+ ostringstream out;
+ for (size_t i = 0; i < unknowns.size(); ++i) {
+ out << *(unknowns[i]);
+ }
+ out << endl;
+ collector->Write(translationId, out.str());
+ }
+
+}
+
+void Manager::OutputWordGraph(OutputCollector *collector) const
+{
+ if (collector) {
+ long translationId = m_source.GetTranslationId();
+ ostringstream out;
+ FixPrecision(out,PRECISION);
+ GetWordGraph(translationId, out);
+ collector->Write(translationId, out.str());
+ }
+}
+
+void Manager::OutputSearchGraph(OutputCollector *collector) const
+{
+ if (collector) {
+ long translationId = m_source.GetTranslationId();
+ ostringstream out;
+ FixPrecision(out,PRECISION);
+ OutputSearchGraph(translationId, out);
+ collector->Write(translationId, out.str());
+
+#ifdef HAVE_PROTOBUF
+ const StaticData &staticData = StaticData::Instance();
+ if (staticData.GetOutputSearchGraphPB()) {
+ ostringstream sfn;
+ sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << translationId << ".pb" << ends;
+ string fn = sfn.str();
+ VERBOSE(2, "Writing search graph to " << fn << endl);
+ fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out);
+ SerializeSearchGraphPB(translationId, output);
+ }
+#endif
+ }
+
+}
+
+void Manager::OutputSearchGraphSLF() const
+{
+ const StaticData &staticData = StaticData::Instance();
+ long translationId = m_source.GetTranslationId();
+
+ // Output search graph in HTK standard lattice format (SLF)
+ bool slf = staticData.GetOutputSearchGraphSLF();
+ if (slf) {
+ stringstream fileName;
+
+ string dir;
+ staticData.GetParameter().SetParameter<string>(dir, "output-search-graph-slf", "");
+
+ fileName << dir << "/" << translationId << ".slf";
+ ofstream *file = new ofstream;
+ file->open(fileName.str().c_str());
+ if (file->is_open() && file->good()) {
+ ostringstream out;
+ FixPrecision(out,PRECISION);
+ OutputSearchGraphAsSLF(translationId, out);
+ *file << out.str();
+ file -> flush();
+ } else {
+ TRACE_ERR("Cannot output HTK standard lattice for line " << translationId << " because the output file is not open or not ready for writing" << endl);
+ }
+ delete file;
+ }
+
+}
+
+void Manager::OutputSearchGraphHypergraph() const
+{
+ const StaticData &staticData = StaticData::Instance();
+ if (staticData.GetOutputSearchGraphHypergraph()) {
+ HypergraphOutput<Manager> hypergraphOutput(PRECISION);
+ hypergraphOutput.Write(*this);
+ }
+}
+
+} // namespace
diff --git a/moses/Manager.h b/moses/Manager.h
index ccf57c527..8e948c9c2 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -34,6 +34,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "WordsBitmap.h"
#include "Search.h"
#include "SearchCubePruning.h"
+#include "BaseManager.h"
namespace Moses
{
@@ -91,7 +92,7 @@ struct SearchGraphNode {
* the appropriate stack, or re-combined with existing hypotheses
**/
-class Manager
+class Manager : public BaseManager
{
Manager();
Manager(Manager const&);
@@ -106,12 +107,10 @@ private:
// Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
- size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
protected:
// data
-// InputType const& m_source; /**< source sentence to be translated */
TranslationOptionCollection *m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */
Search *m_search;
@@ -119,7 +118,6 @@ protected:
size_t interrupted_flag;
std::auto_ptr<SentenceStats> m_sentenceStats;
int m_hypoId; //used to number the hypos as they are created.
- size_t m_lineNumber;
void GetConnectedGraph(
std::map< int, bool >* pConnected,
@@ -128,14 +126,30 @@ protected:
std::map< int, bool >* pConnected,
std::vector< const Hypothesis* >* pConnectedList) const;
+ // output
+ // nbest
+ void OutputNBest(std::ostream& out
+ , const Moses::TrellisPathList &nBestList
+ , const std::vector<Moses::FactorType>& outputFactorOrder
+ , long translationId
+ , char reportSegmentation) const;
+ void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
+ char reportSegmentation, bool reportAllFactors) const;
+ void OutputAlignment(std::ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset) const;
+ void OutputInput(std::ostream& os, const Hypothesis* hypo) const;
+ void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo) const;
+ std::map<size_t, const Factor*> GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor) const;
+ void OutputAlignment(OutputCollector* collector, size_t lineNo , const std::vector<const Hypothesis *> &edges) const;
+ void OutputAlignment(std::ostream &out, const std::vector<const Hypothesis *> &edges) const;
+
+ void OutputWordGraph(std::ostream &outputWordGraphStream, const Hypothesis *hypo, size_t &linkId) const;
public:
- InputType const& m_source; /**< source sentence to be translated */
- Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm);
+ Manager(InputType const& source, SearchAlgorithm searchAlgorithm);
~Manager();
const TranslationOptionCollection* getSntTranslationOptions();
- void ProcessSentence();
+ void Decode();
const Hypothesis *GetBestHypothesis() const;
const Hypothesis *GetActualBestHypothesis() const;
void CalcNBest(size_t count, TrellisPathList &ret,bool onlyDistinct=0) const;
@@ -146,13 +160,14 @@ public:
void GetOutputLanguageModelOrder( std::ostream &out, const Hypothesis *hypo );
void GetWordGraph(long translationId, std::ostream &outputWordGraphStream) const;
int GetNextHypoId();
+
#ifdef HAVE_PROTOBUF
void SerializeSearchGraphPB(long translationId, std::ostream& outputStream) const;
#endif
void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
- void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const;
+ void OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const;
void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
const InputType& GetSource() const {
return m_source;
@@ -171,6 +186,19 @@ public:
void GetForwardBackwardSearchGraph(std::map< int, bool >* pConnected,
std::vector< const Hypothesis* >* pConnectedList, std::map < const Hypothesis*, std::set < const Hypothesis* > >* pOutgoingHyps, std::vector< float>* pFwdBwdScores) const;
+ // outputs
+ void OutputNBest(OutputCollector *collector) const;
+ void OutputAlignment(OutputCollector *collector) const;
+ void OutputLatticeSamples(OutputCollector *collector) const;
+ void OutputDetailedTranslationReport(OutputCollector *collector) const;
+ void OutputUnknowns(OutputCollector *collector) const;
+ void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const
+ {}
+ void OutputWordGraph(OutputCollector *collector) const;
+ void OutputSearchGraph(OutputCollector *collector) const;
+ void OutputSearchGraphSLF() const;
+ void OutputSearchGraphHypergraph() const;
+
};
}
diff --git a/moses/MockHypothesis.cpp b/moses/MockHypothesis.cpp
index 81fcb24b8..c18b58a5e 100644
--- a/moses/MockHypothesis.cpp
+++ b/moses/MockHypothesis.cpp
@@ -41,7 +41,7 @@ MockHypothesisGuard::MockHypothesisGuard(
m_wp("WordPenalty"),
m_uwp("UnknownWordPenalty"),
m_dist("Distortion"),
- m_manager(0,m_sentence,Normal)
+ m_manager(m_sentence,Normal)
{
BOOST_CHECK_EQUAL(alignments.size(), targetSegments.size());
@@ -62,7 +62,7 @@ MockHypothesisGuard::MockHypothesisGuard(
for (; ti != targetSegments.end() && ai != alignments.end(); ++ti,++ai) {
Hypothesis* prevHypo = m_hypothesis;
WordsRange wordsRange(ai->first,ai->second);
- m_targetPhrases.push_back(TargetPhrase());
+ m_targetPhrases.push_back(TargetPhrase(NULL));
// m_targetPhrases.back().CreateFromString(Input, factors, *ti, "|", NULL);
m_targetPhrases.back().CreateFromString(Input, factors, *ti, NULL);
m_toptions.push_back(new TranslationOption
diff --git a/moses/PDTAimp.cpp b/moses/PDTAimp.cpp
new file mode 100644
index 000000000..34f65da4c
--- /dev/null
+++ b/moses/PDTAimp.cpp
@@ -0,0 +1,462 @@
+#include "PDTAimp.h"
+
+namespace Moses
+{
+
+PDTAimp::PDTAimp(PhraseDictionaryTreeAdaptor *p)
+ : m_dict(0),
+ m_obj(p),
+ useCache(1),
+ totalE(0),
+ distinctE(0) {
+ m_numInputScores = 0;
+ m_inputFeature = &InputFeature::Instance();
+
+ if (m_inputFeature) {
+ const PhraseDictionary *firstPt = PhraseDictionary::GetColl()[0];
+ if (firstPt == m_obj) {
+ m_numInputScores = m_inputFeature->GetNumScoreComponents();
+ }
+ }
+}
+
+PDTAimp::~PDTAimp() {
+ CleanUp();
+ delete m_dict;
+
+ if (StaticData::Instance().GetVerboseLevel() >= 2) {
+
+ TRACE_ERR("tgt candidates stats: total="<<totalE<<"; distinct="
+ <<distinctE<<" ("<<distinctE/(0.01*totalE)<<"); duplicates="
+ <<totalE-distinctE<<" ("<<(totalE-distinctE)/(0.01*totalE)
+ <<")\n");
+
+ TRACE_ERR("\npath statistics\n");
+
+ if(path1Best.size()) {
+ TRACE_ERR("1-best: ");
+ std::copy(path1Best.begin()+1,path1Best.end(),
+ std::ostream_iterator<size_t>(std::cerr," \t"));
+ TRACE_ERR("\n");
+ }
+ if(pathCN.size()) {
+ TRACE_ERR("CN (full): ");
+ std::transform(pathCN.begin()+1
+ ,pathCN.end()
+ ,std::ostream_iterator<double>(std::cerr," \t")
+ ,Exp);
+ TRACE_ERR("\n");
+ }
+ if(pathExplored.size()) {
+ TRACE_ERR("CN (explored): ");
+ std::copy(pathExplored.begin()+1,pathExplored.end(),
+ std::ostream_iterator<size_t>(std::cerr," \t"));
+ TRACE_ERR("\n");
+ }
+ }
+
+}
+
+void PDTAimp::CleanUp() {
+ assert(m_dict);
+ m_dict->FreeMemory();
+ for(size_t i=0; i<m_tgtColls.size(); ++i) delete m_tgtColls[i];
+ m_tgtColls.clear();
+ m_cache.clear();
+ m_rangeCache.clear();
+ uniqSrcPhr.clear();
+}
+
+TargetPhraseCollectionWithSourcePhrase const*
+PDTAimp::GetTargetPhraseCollection(Phrase const &src) const {
+
+ assert(m_dict);
+ if(src.GetSize()==0) return 0;
+
+ std::pair<MapSrc2Tgt::iterator,bool> piter;
+ if(useCache) {
+ piter=m_cache.insert(std::make_pair(src,static_cast<TargetPhraseCollectionWithSourcePhrase const*>(0)));
+ if(!piter.second) return piter.first->second;
+ } else if (m_cache.size()) {
+ MapSrc2Tgt::const_iterator i=m_cache.find(src);
+ return (i!=m_cache.end() ? i->second : 0);
+ }
+
+ std::vector<std::string> srcString(src.GetSize());
+ // convert source Phrase into vector of strings
+ for(size_t i=0; i<srcString.size(); ++i) {
+ Factors2String(src.GetWord(i),srcString[i]);
+ }
+
+ // get target phrases in string representation
+ std::vector<StringTgtCand> cands;
+ std::vector<std::string> wacands;
+ m_dict->GetTargetCandidates(srcString,cands,wacands);
+ if(cands.empty()) {
+ return 0;
+ }
+
+ //TODO: Multiple models broken here
+ std::vector<float> weights = StaticData::Instance().GetWeights(m_obj);
+
+ std::vector<TargetPhrase> tCands;
+ tCands.reserve(cands.size());
+
+ std::vector<std::pair<float,size_t> > costs;
+ costs.reserve(cands.size());
+
+ std::vector<Phrase> sourcePhrases;
+ sourcePhrases.reserve(cands.size());
+
+
+ // convert into TargetPhrases
+ for(size_t i=0; i<cands.size(); ++i) {
+ TargetPhrase targetPhrase(m_obj);
+
+ StringTgtCand::Tokens const& factorStrings=cands[i].tokens;
+ Scores const& probVector=cands[i].scores;
+
+ std::vector<float> scoreVector(probVector.size());
+ std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
+ TransformScore);
+ std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
+ FloorScore);
+
+ //sparse features.
+ //These are already in log-space
+ for (size_t j = 0; j < cands[i].fnames.size(); ++j) {
+ targetPhrase.GetScoreBreakdown().Assign(m_obj, *cands[i].fnames[j], cands[i].fvalues[j]);
+ }
+
+ CreateTargetPhrase(targetPhrase,factorStrings,scoreVector, Scores(0), &wacands[i], &src);
+
+ costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
+ tCands.push_back(targetPhrase);
+
+ sourcePhrases.push_back(src);
+ }
+
+ TargetPhraseCollectionWithSourcePhrase *rv;
+ rv=PruneTargetCandidates(tCands,costs, sourcePhrases);
+ if(rv->IsEmpty()) {
+ delete rv;
+ return 0;
+ } else {
+ if(useCache) piter.first->second=rv;
+ m_tgtColls.push_back(rv);
+ return rv;
+ }
+
+}
+
+void PDTAimp::Create(const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &filePath
+ , const std::vector<float> &weight
+ ) {
+
+ // set my members
+ m_dict=new PhraseDictionaryTree();
+ m_input=input;
+ m_output=output;
+
+ const StaticData &staticData = StaticData::Instance();
+ m_dict->NeedAlignmentInfo(staticData.NeedAlignmentInfo());
+
+ std::string binFname=filePath+".binphr.idx";
+ if(!FileExists(binFname.c_str())) {
+ UTIL_THROW2( "bin ttable does not exist");
+ //TRACE_ERR( "bin ttable does not exist -> create it\n");
+ //InputFileStream in(filePath);
+ //m_dict->Create(in,filePath);
+ }
+ VERBOSE(1,"reading bin ttable\n");
+// m_dict->Read(filePath);
+ bool res=m_dict->Read(filePath);
+ if (!res) {
+ std::stringstream strme;
+ strme << "bin ttable was read in a wrong way\n";
+ UserMessage::Add(strme.str());
+ exit(1);
+ }
+}
+
+
+void PDTAimp::CacheSource(ConfusionNet const& src) {
+ assert(m_dict);
+ const size_t srcSize=src.GetSize();
+
+ std::vector<size_t> exploredPaths(srcSize+1,0);
+ std::vector<double> exPathsD(srcSize+1,-1.0);
+
+ // collect some statistics
+ std::vector<size_t> cnDepths(srcSize,0);
+ for(size_t i=0; i<srcSize; ++i) cnDepths[i]=src[i].size();
+
+ for(size_t len=1; len<=srcSize; ++len)
+ for(size_t i=0; i<=srcSize-len; ++i) {
+ double pd=0.0;
+ for(size_t k=i; k<i+len; ++k) pd+=log(1.0*cnDepths[k]);
+ exPathsD[len]=(exPathsD[len]>=0.0 ? addLogScale(pd,exPathsD[len]) : pd);
+ }
+
+ // update global statistics
+ if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1,-1.0);
+ for(size_t len=1; len<=srcSize; ++len)
+ pathCN[len]=pathCN[len]>=0.0 ? addLogScale(pathCN[len],exPathsD[len]) : exPathsD[len];
+
+ if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1,0);
+ for(size_t len=1; len<=srcSize; ++len) path1Best[len]+=srcSize-len+1;
+
+
+ if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size()) {
+ TRACE_ERR("path stats for current CN: \nCN (full): ");
+ std::transform(exPathsD.begin()+1
+ ,exPathsD.end()
+ ,std::ostream_iterator<double>(std::cerr," ")
+ ,Exp);
+ TRACE_ERR("\n");
+ }
+
+ typedef StringTgtCand::Tokens sPhrase;
+ typedef std::map<StringTgtCand::Tokens,TScores> E2Costs;
+
+ std::map<Range,E2Costs> cov2cand;
+ std::vector<State> stack;
+ for(Position i=0 ; i < srcSize ; ++i)
+ stack.push_back(State(i, i, m_dict->GetRoot(), std::vector<float>(m_numInputScores,0.0)));
+
+ std::vector<float> weightTrans = StaticData::Instance().GetWeights(m_obj);
+ std::vector<float> weightInput = StaticData::Instance().GetWeights(m_inputFeature);
+ float weightWP = StaticData::Instance().GetWeightWordPenalty();
+
+ while(!stack.empty()) {
+ State curr(stack.back());
+ stack.pop_back();
+
+ UTIL_THROW_IF2(curr.end() >= srcSize, "Error");
+ const ConfusionNet::Column &currCol=src[curr.end()];
+ // in a given column, loop over all possibilities
+ for(size_t colidx=0; colidx<currCol.size(); ++colidx) {
+ const Word& w=currCol[colidx].first; // w=the i^th possibility in column colidx
+ std::string s;
+ Factors2String(w,s);
+ bool isEpsilon=(s=="" || s==EPSILON);
+
+ //assert that we have the right number of link params in this CN option
+ UTIL_THROW_IF2(currCol[colidx].second.denseScores.size() < m_numInputScores,
+ "Incorrect number of input scores");
+
+ // do not start with epsilon (except at first position)
+ if(isEpsilon && curr.begin()==curr.end() && curr.begin()>0) continue;
+
+ // At a given node in the prefix tree, look to see if w defines an edge to
+ // another node (Extend). Stay at the same node if w==EPSILON
+ PPtr nextP = (isEpsilon ? curr.ptr : m_dict->Extend(curr.ptr,s));
+
+ if(nextP) { // w is a word that should be considered
+ Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx));
+
+ //add together the link scores from the current state and the new arc
+ float inputScoreSum = 0;
+ std::vector<float> newInputScores(m_numInputScores,0.0);
+ if (m_numInputScores) {
+ std::transform(currCol[colidx].second.denseScores.begin(), currCol[colidx].second.denseScores.end(),
+ curr.GetScores().begin(),
+ newInputScores.begin(),
+ std::plus<float>());
+
+
+ //we need to sum up link weights (excluding realWordCount, which isn't in numLinkParams)
+ //if the sum is too low, then we won't expand this.
+ //TODO: dodgy! shouldn't we consider weights here? what about zero-weight params?
+ inputScoreSum = std::accumulate(newInputScores.begin(),newInputScores.begin()+m_numInputScores,0.0);
+ }
+
+ Phrase newSrc(curr.src);
+ if(!isEpsilon) newSrc.AddWord(w);
+ if(newRange.second<srcSize && inputScoreSum>LOWEST_SCORE) {
+ // if there is more room to grow, add a new state onto the queue
+ // to be explored that represents [begin, curEnd+)
+ stack.push_back(State(newRange,nextP,newInputScores));
+ stack.back().src=newSrc;
+ }
+
+ std::vector<StringTgtCand> tcands;
+ // now, look up the target candidates (aprx. TargetPhraseCollection) for
+ // the current path through the CN
+ m_dict->GetTargetCandidates(nextP,tcands);
+
+ if(newRange.second>=exploredPaths.size()+newRange.first)
+ exploredPaths.resize(newRange.second-newRange.first+1,0);
+ ++exploredPaths[newRange.second-newRange.first];
+
+ totalE+=tcands.size();
+
+ if(tcands.size()) {
+ E2Costs& e2costs=cov2cand[newRange];
+ Phrase const* srcPtr=uniqSrcPhr(newSrc);
+ for(size_t i=0; i<tcands.size(); ++i) {
+ //put input scores in first - already logged, just drop in directly
+ std::vector<float> transcores(m_obj->GetNumScoreComponents());
+ UTIL_THROW_IF2(transcores.size() != weightTrans.size(),
+ "Incorrect number of translation scores");
+
+ //put in phrase table scores, logging as we insert
+ std::transform(tcands[i].scores.begin()
+ ,tcands[i].scores.end()
+ ,transcores.begin()
+ ,TransformScore);
+
+
+ //tally up
+ float score=std::inner_product(transcores.begin(), transcores.end(), weightTrans.begin(), 0.0f);
+
+ // input feature
+ score +=std::inner_product(newInputScores.begin(), newInputScores.end(), weightInput.begin(), 0.0f);
+
+ //count word penalty
+ score-=tcands[i].tokens.size() * weightWP;
+
+ std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(tcands[i].tokens,TScores()));
+
+ if(p.second) ++distinctE;
+
+ TScores & scores=p.first->second;
+ if(p.second || scores.total<score) {
+ scores.total=score;
+ scores.transScore=transcores;
+ scores.inputScores=newInputScores;
+ scores.src=srcPtr;
+ }
+ }
+ }
+ }
+ }
+ } // end while(!stack.empty())
+
+
+ if (StaticData::Instance().GetVerboseLevel() >= 2 && exploredPaths.size()) {
+ TRACE_ERR("CN (explored): ");
+ std::copy(exploredPaths.begin()+1,exploredPaths.end(),
+ std::ostream_iterator<size_t>(std::cerr," "));
+ TRACE_ERR("\n");
+ }
+
+ if(pathExplored.size()<exploredPaths.size())
+ pathExplored.resize(exploredPaths.size(),0);
+ for(size_t len=1; len<=srcSize; ++len)
+ pathExplored[len]+=exploredPaths[len];
+
+
+ m_rangeCache.resize(src.GetSize(),vTPC(src.GetSize(),0));
+
+ for(std::map<Range,E2Costs>::const_iterator i=cov2cand.begin(); i!=cov2cand.end(); ++i) {
+ assert(i->first.first<m_rangeCache.size());
+ assert(i->first.second>0);
+ assert(static_cast<size_t>(i->first.second-1)<m_rangeCache[i->first.first].size());
+ assert(m_rangeCache[i->first.first][i->first.second-1]==0);
+
+ std::vector<TargetPhrase> tCands;
+ tCands.reserve(i->second.size());
+
+ std::vector<std::pair<float,size_t> > costs;
+ costs.reserve(i->second.size());
+
+ std::vector<Phrase> sourcePhrases;
+ sourcePhrases.reserve(i->second.size());
+
+ for(E2Costs::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) {
+ TScores const & scores=j->second;
+ TargetPhrase targetPhrase(m_obj);
+ CreateTargetPhrase(targetPhrase
+ , j ->first
+ , scores.transScore
+ , scores.inputScores
+ , NULL
+ , scores.src);
+ costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
+ tCands.push_back(targetPhrase);
+
+ sourcePhrases.push_back(*scores.src);
+
+ //std::cerr << i->first.first << "-" << i->first.second << ": " << targetPhrase << std::endl;
+ }
+
+ TargetPhraseCollectionWithSourcePhrase *rv=PruneTargetCandidates(tCands, costs, sourcePhrases);
+
+ if(rv->IsEmpty())
+ delete rv;
+ else {
+ m_rangeCache[i->first.first][i->first.second-1]=rv;
+ m_tgtColls.push_back(rv);
+ }
+ }
+ // free memory
+ m_dict->FreeMemory();
+}
+
+void PDTAimp::CreateTargetPhrase(TargetPhrase& targetPhrase,
+ StringTgtCand::Tokens const& factorStrings,
+ Scores const& transVector,
+ Scores const& inputVector,
+ const std::string *alignmentString,
+ Phrase const* srcPtr) const {
+ FactorCollection &factorCollection = FactorCollection::Instance();
+
+ for(size_t k=0; k<factorStrings.size(); ++k) {
+ util::TokenIter<util::MultiCharacter, false> word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter());
+ Word& w=targetPhrase.AddWord();
+ for(size_t l=0; l<m_output.size(); ++l, ++word) {
+ w[m_output[l]]= factorCollection.AddFactor(*word);
+ }
+ }
+
+ if (alignmentString) {
+ targetPhrase.SetAlignmentInfo(*alignmentString);
+ }
+
+ if (m_numInputScores) {
+ targetPhrase.GetScoreBreakdown().Assign(m_inputFeature, inputVector);
+ }
+
+ targetPhrase.GetScoreBreakdown().Assign(m_obj, transVector);
+ targetPhrase.EvaluateInIsolation(*srcPtr, m_obj->GetFeaturesToApply());
+}
+
+TargetPhraseCollectionWithSourcePhrase* PDTAimp::PruneTargetCandidates
+(const std::vector<TargetPhrase> & tCands,
+ std::vector<std::pair<float,size_t> >& costs,
+ const std::vector<Phrase> &sourcePhrases) const {
+ // convert into TargetPhraseCollection
+ UTIL_THROW_IF2(tCands.size() != sourcePhrases.size(),
+ "Number of target phrases must equal number of source phrases");
+
+ TargetPhraseCollectionWithSourcePhrase *rv=new TargetPhraseCollectionWithSourcePhrase;
+
+
+ // set limit to tableLimit or actual size, whatever is smaller
+ std::vector<std::pair<float,size_t> >::iterator nth =
+ costs.begin() + ((m_obj->m_tableLimit>0 && // 0 indicates no limit
+ m_obj->m_tableLimit < costs.size()) ?
+ m_obj->m_tableLimit : costs.size());
+
+ // find the nth phrase according to future cost
+ NTH_ELEMENT3(costs.begin(),nth ,costs.end());
+
+ // add n top phrases to the return list
+ for(std::vector<std::pair<float,size_t> >::iterator
+ it = costs.begin(); it != nth; ++it) {
+ size_t ind = it->second;
+ TargetPhrase *targetPhrase = new TargetPhrase(tCands[ind]);
+ const Phrase &sourcePhrase = sourcePhrases[ind];
+ rv->Add(targetPhrase, sourcePhrase);
+
+ }
+
+ return rv;
+}
+
+}
+
+
diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h
index fefa39db2..01de1e88a 100644
--- a/moses/PDTAimp.h
+++ b/moses/PDTAimp.h
@@ -38,23 +38,7 @@ class PDTAimp
friend class PhraseDictionaryTreeAdaptor;
protected:
- PDTAimp(PhraseDictionaryTreeAdaptor *p)
- : m_dict(0),
- m_obj(p),
- useCache(1),
- totalE(0),
- distinctE(0) {
- m_numInputScores = 0;
- const StaticData &staticData = StaticData::Instance();
- m_inputFeature = &InputFeature::Instance();
-
- if (m_inputFeature) {
- const PhraseDictionary *firstPt = PhraseDictionary::GetColl()[0];
- if (firstPt == m_obj) {
- m_numInputScores = m_inputFeature->GetNumScoreComponents();
- }
- }
- }
+ PDTAimp(PhraseDictionaryTreeAdaptor *p);
public:
std::vector<FactorType> m_input,m_output;
@@ -77,195 +61,22 @@ public:
std::vector<size_t> path1Best,pathExplored;
std::vector<double> pathCN;
- ~PDTAimp() {
- CleanUp();
- delete m_dict;
-
- if (StaticData::Instance().GetVerboseLevel() >= 2) {
-
- TRACE_ERR("tgt candidates stats: total="<<totalE<<"; distinct="
- <<distinctE<<" ("<<distinctE/(0.01*totalE)<<"); duplicates="
- <<totalE-distinctE<<" ("<<(totalE-distinctE)/(0.01*totalE)
- <<")\n");
-
- TRACE_ERR("\npath statistics\n");
-
- if(path1Best.size()) {
- TRACE_ERR("1-best: ");
- std::copy(path1Best.begin()+1,path1Best.end(),
- std::ostream_iterator<size_t>(std::cerr," \t"));
- TRACE_ERR("\n");
- }
- if(pathCN.size()) {
- TRACE_ERR("CN (full): ");
- std::transform(pathCN.begin()+1
- ,pathCN.end()
- ,std::ostream_iterator<double>(std::cerr," \t")
- ,Exp);
- TRACE_ERR("\n");
- }
- if(pathExplored.size()) {
- TRACE_ERR("CN (explored): ");
- std::copy(pathExplored.begin()+1,pathExplored.end(),
- std::ostream_iterator<size_t>(std::cerr," \t"));
- TRACE_ERR("\n");
- }
- }
-
- }
+ ~PDTAimp();
void Factors2String(Word const& w,std::string& s) const {
s=w.GetString(m_input,false);
}
- void CleanUp() {
- assert(m_dict);
- m_dict->FreeMemory();
- for(size_t i=0; i<m_tgtColls.size(); ++i) delete m_tgtColls[i];
- m_tgtColls.clear();
- m_cache.clear();
- m_rangeCache.clear();
- uniqSrcPhr.clear();
- }
+ void CleanUp();
TargetPhraseCollectionWithSourcePhrase const*
- GetTargetPhraseCollection(Phrase const &src) const {
-
- VERBOSE(2,"PDTAimp::GetTargetPhraseCollection" << std::endl);
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection: src:|" << src << "|" << std::endl);
-
- assert(m_dict);
- if(src.GetSize()==0) return 0;
-
- std::pair<MapSrc2Tgt::iterator,bool> piter;
- if(useCache) {
- piter=m_cache.insert(std::make_pair(src,static_cast<TargetPhraseCollectionWithSourcePhrase const*>(0)));
- if(!piter.second) {
- if (piter.first->second) {
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection: piter.first->second->GetSize():" << (piter.first->second)->GetSize() << std::endl);
- } else {
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection: piter.first->second->GetSize():" << 0 << std::endl);
- }
- return piter.first->second;
- }
- } else if (m_cache.size()) {
- MapSrc2Tgt::const_iterator i=m_cache.find(src);
- if (i!=m_cache.end()) {
- if (i->second) {
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection: i->second->GetSize():" << (void*) (i->second) << std::endl);
- } else {
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection: i->second->GetSize():" << 0 << std::endl);
- }
- } else {
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection: i->second->GetSize():" << 0 << std::endl);
- }
- return (i!=m_cache.end() ? i->second : 0);
- }
-
- std::vector<std::string> srcString(src.GetSize());
- // convert source Phrase into vector of strings
- for(size_t i=0; i<srcString.size(); ++i) {
- Factors2String(src.GetWord(i),srcString[i]);
- }
-
- // get target phrases in string representation
- std::vector<StringTgtCand> cands;
- std::vector<std::string> wacands;
- m_dict->GetTargetCandidates(srcString,cands,wacands);
- if(cands.empty()) {
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection ret->GetSize():" << 0 << std::endl);
- return 0;
- }
-
- //TODO: Multiple models broken here
- std::vector<float> weights = StaticData::Instance().GetWeights(m_obj);
-
- std::vector<TargetPhrase> tCands;
- tCands.reserve(cands.size());
-
- std::vector<std::pair<float,size_t> > costs;
- costs.reserve(cands.size());
-
- std::vector<Phrase> sourcePhrases;
- sourcePhrases.reserve(cands.size());
-
-
- // convert into TargetPhrases
- for(size_t i=0; i<cands.size(); ++i) {
- TargetPhrase targetPhrase;
-
- StringTgtCand::Tokens const& factorStrings=cands[i].tokens;
- Scores const& probVector=cands[i].scores;
-
- std::vector<float> scoreVector(probVector.size());
- std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
- TransformScore);
- std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
- FloorScore);
-
- //sparse features.
- //These are already in log-space
- for (size_t j = 0; j < cands[i].fnames.size(); ++j) {
- targetPhrase.GetScoreBreakdown().Assign(m_obj, *cands[i].fnames[j], cands[i].fvalues[j]);
- }
-
- CreateTargetPhrase(targetPhrase,factorStrings,scoreVector, Scores(0), &wacands[i], &src);
-
- costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
- tCands.push_back(targetPhrase);
-
- sourcePhrases.push_back(src);
- }
-
-
- TargetPhraseCollectionWithSourcePhrase *rv;
- rv=PruneTargetCandidates(tCands,costs, sourcePhrases);
- if(rv->IsEmpty()) {
- delete rv;
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection rv->GetSize():" << 0 << std::endl);
- return 0;
- } else {
- if(useCache) piter.first->second=rv;
- VERBOSE(3,"PDTAimp::GetTargetPhraseCollection rv->GetSize():" << rv->GetSize() << std::endl);
- m_tgtColls.push_back(rv);
- return rv;
- }
-
- }
-
-
+ GetTargetPhraseCollection(Phrase const &src) const;
void Create(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::string &filePath
- , const std::vector<float> &weight
- ) {
-
- // set my members
- m_dict=new PhraseDictionaryTree();
- m_input=input;
- m_output=output;
+ , const std::vector<float> &weight);
- const StaticData &staticData = StaticData::Instance();
- m_dict->NeedAlignmentInfo(staticData.NeedAlignmentInfo());
-
- std::string binFname=filePath+".binphr.idx";
- if(!FileExists(binFname.c_str())) {
- UTIL_THROW2( "bin ttable does not exist");
- //TRACE_ERR( "bin ttable does not exist -> create it\n");
- //InputFileStream in(filePath);
- //m_dict->Create(in,filePath);
- }
- TRACE_ERR( "reading bin ttable\n");
-// m_dict->Read(filePath);
- bool res=m_dict->Read(filePath);
- if (!res) {
- std::stringstream strme;
- strme << "bin ttable was read in a wrong way\n";
- UserMessage::Add(strme.str());
- exit(1);
- }
- }
typedef PhraseDictionaryTree::PrefixPtr PPtr;
typedef unsigned short Position;
@@ -308,61 +119,13 @@ public:
Scores const& transVector,
Scores const& inputVector,
const std::string *alignmentString,
- Phrase const* srcPtr=0) const {
- FactorCollection &factorCollection = FactorCollection::Instance();
-
- for(size_t k=0; k<factorStrings.size(); ++k) {
- util::TokenIter<util::MultiCharacter, false> word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter());
- Word& w=targetPhrase.AddWord();
- for(size_t l=0; l<m_output.size(); ++l, ++word) {
- w[m_output[l]]= factorCollection.AddFactor(*word);
- }
- }
-
- if (alignmentString) {
- targetPhrase.SetAlignmentInfo(*alignmentString);
- }
-
- if (m_numInputScores) {
- targetPhrase.GetScoreBreakdown().Assign(m_inputFeature, inputVector);
- }
-
- targetPhrase.GetScoreBreakdown().Assign(m_obj, transVector);
- targetPhrase.Evaluate(*srcPtr, m_obj->GetFeaturesToApply());
- }
+ Phrase const* srcPtr=0) const;
TargetPhraseCollectionWithSourcePhrase* PruneTargetCandidates
(const std::vector<TargetPhrase> & tCands,
std::vector<std::pair<float,size_t> >& costs,
- const std::vector<Phrase> &sourcePhrases) const {
- // convert into TargetPhraseCollection
- UTIL_THROW_IF2(tCands.size() != sourcePhrases.size(),
- "Number of target phrases must equal number of source phrases");
-
- TargetPhraseCollectionWithSourcePhrase *rv=new TargetPhraseCollectionWithSourcePhrase;
-
+ const std::vector<Phrase> &sourcePhrases) const;
- // set limit to tableLimit or actual size, whatever is smaller
- std::vector<std::pair<float,size_t> >::iterator nth =
- costs.begin() + ((m_obj->m_tableLimit>0 && // 0 indicates no limit
- m_obj->m_tableLimit < costs.size()) ?
- m_obj->m_tableLimit : costs.size());
-
- // find the nth phrase according to future cost
- NTH_ELEMENT3(costs.begin(),nth ,costs.end());
-
- // add n top phrases to the return list
- for(std::vector<std::pair<float,size_t> >::iterator
- it = costs.begin(); it != nth; ++it) {
- size_t ind = it->second;
- TargetPhrase *targetPhrase = new TargetPhrase(tCands[ind]);
- const Phrase &sourcePhrase = sourcePhrases[ind];
- rv->Add(targetPhrase, sourcePhrase);
-
- }
-
- return rv;
- }
// POD for target phrase scores
struct TScores {
@@ -373,222 +136,7 @@ public:
TScores() : total(0.0),src(0) {}
};
- void CacheSource(ConfusionNet const& src) {
- assert(m_dict);
- const size_t srcSize=src.GetSize();
-
- std::vector<size_t> exploredPaths(srcSize+1,0);
- std::vector<double> exPathsD(srcSize+1,-1.0);
-
- // collect some statistics
- std::vector<size_t> cnDepths(srcSize,0);
- for(size_t i=0; i<srcSize; ++i) cnDepths[i]=src[i].size();
-
- for(size_t len=1; len<=srcSize; ++len)
- for(size_t i=0; i<=srcSize-len; ++i) {
- double pd=0.0;
- for(size_t k=i; k<i+len; ++k) pd+=log(1.0*cnDepths[k]);
- exPathsD[len]=(exPathsD[len]>=0.0 ? addLogScale(pd,exPathsD[len]) : pd);
- }
-
- // update global statistics
- if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1,-1.0);
- for(size_t len=1; len<=srcSize; ++len)
- pathCN[len]=pathCN[len]>=0.0 ? addLogScale(pathCN[len],exPathsD[len]) : exPathsD[len];
-
- if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1,0);
- for(size_t len=1; len<=srcSize; ++len) path1Best[len]+=srcSize-len+1;
-
-
- if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size()) {
- TRACE_ERR("path stats for current CN: \nCN (full): ");
- std::transform(exPathsD.begin()+1
- ,exPathsD.end()
- ,std::ostream_iterator<double>(std::cerr," ")
- ,Exp);
- TRACE_ERR("\n");
- }
-
- typedef StringTgtCand::Tokens sPhrase;
- typedef std::map<StringTgtCand::Tokens,TScores> E2Costs;
-
- std::map<Range,E2Costs> cov2cand;
- std::vector<State> stack;
- for(Position i=0 ; i < srcSize ; ++i)
- stack.push_back(State(i, i, m_dict->GetRoot(), std::vector<float>(m_numInputScores,0.0)));
-
- std::vector<float> weightTrans = StaticData::Instance().GetWeights(m_obj);
- std::vector<float> weightInput = StaticData::Instance().GetWeights(m_inputFeature);
- float weightWP = StaticData::Instance().GetWeightWordPenalty();
-// float weightPP = StaticData::Instance().GetWeightPhrasePenalty();
-
- while(!stack.empty()) {
- State curr(stack.back());
- stack.pop_back();
-
- UTIL_THROW_IF2(curr.end() >= srcSize, "Error");
- const ConfusionNet::Column &currCol=src[curr.end()];
- // in a given column, loop over all possibilities
- for(size_t colidx=0; colidx<currCol.size(); ++colidx) {
- const Word& w=currCol[colidx].first; // w=the i^th possibility in column colidx
- std::string s;
- Factors2String(w,s);
- bool isEpsilon=(s=="" || s==EPSILON);
-
- //assert that we have the right number of link params in this CN option
- UTIL_THROW_IF2(currCol[colidx].second.denseScores.size() < m_numInputScores,
- "Incorrect number of input scores");
-
- // do not start with epsilon (except at first position)
- if(isEpsilon && curr.begin()==curr.end() && curr.begin()>0) continue;
-
- // At a given node in the prefix tree, look to see if w defines an edge to
- // another node (Extend). Stay at the same node if w==EPSILON
- PPtr nextP = (isEpsilon ? curr.ptr : m_dict->Extend(curr.ptr,s));
-
- if(nextP) { // w is a word that should be considered
- Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx));
-
- //add together the link scores from the current state and the new arc
- float inputScoreSum = 0;
- std::vector<float> newInputScores(m_numInputScores,0.0);
- if (m_numInputScores) {
- std::transform(currCol[colidx].second.denseScores.begin(), currCol[colidx].second.denseScores.end(),
- curr.GetScores().begin(),
- newInputScores.begin(),
- std::plus<float>());
-
-
- //we need to sum up link weights (excluding realWordCount, which isn't in numLinkParams)
- //if the sum is too low, then we won't expand this.
- //TODO: dodgy! shouldn't we consider weights here? what about zero-weight params?
- inputScoreSum = std::accumulate(newInputScores.begin(),newInputScores.begin()+m_numInputScores,0.0);
- }
-
- Phrase newSrc(curr.src);
- if(!isEpsilon) newSrc.AddWord(w);
- if(newRange.second<srcSize && inputScoreSum>LOWEST_SCORE) {
- // if there is more room to grow, add a new state onto the queue
- // to be explored that represents [begin, curEnd+)
- stack.push_back(State(newRange,nextP,newInputScores));
- stack.back().src=newSrc;
- }
-
- std::vector<StringTgtCand> tcands;
- // now, look up the target candidates (aprx. TargetPhraseCollection) for
- // the current path through the CN
- m_dict->GetTargetCandidates(nextP,tcands);
-
- if(newRange.second>=exploredPaths.size()+newRange.first)
- exploredPaths.resize(newRange.second-newRange.first+1,0);
- ++exploredPaths[newRange.second-newRange.first];
-
- totalE+=tcands.size();
-
- if(tcands.size()) {
- E2Costs& e2costs=cov2cand[newRange];
- Phrase const* srcPtr=uniqSrcPhr(newSrc);
- for(size_t i=0; i<tcands.size(); ++i) {
- //put input scores in first - already logged, just drop in directly
- std::vector<float> transcores(m_obj->GetNumScoreComponents());
- UTIL_THROW_IF2(transcores.size() != weightTrans.size(),
- "Incorrect number of translation scores");
-
- //put in phrase table scores, logging as we insert
- std::transform(tcands[i].scores.begin()
- ,tcands[i].scores.end()
- ,transcores.begin()
- ,TransformScore);
-
-
- //tally up
- float score=std::inner_product(transcores.begin(), transcores.end(), weightTrans.begin(), 0.0f);
-
- // input feature
- score +=std::inner_product(newInputScores.begin(), newInputScores.end(), weightInput.begin(), 0.0f);
-
- //count word penalty
- score-=tcands[i].tokens.size() * weightWP;
-// score-=weightPP;
-
- std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(tcands[i].tokens,TScores()));
-
- if(p.second) ++distinctE;
-
- TScores & scores=p.first->second;
- if(p.second || scores.total<score) {
- scores.total=score;
- scores.transScore=transcores;
- scores.inputScores=newInputScores;
- scores.src=srcPtr;
- }
- }
- }
- }
- }
- } // end while(!stack.empty())
-
-
- if (StaticData::Instance().GetVerboseLevel() >= 2 && exploredPaths.size()) {
- TRACE_ERR("CN (explored): ");
- std::copy(exploredPaths.begin()+1,exploredPaths.end(),
- std::ostream_iterator<size_t>(std::cerr," "));
- TRACE_ERR("\n");
- }
-
- if(pathExplored.size()<exploredPaths.size())
- pathExplored.resize(exploredPaths.size(),0);
- for(size_t len=1; len<=srcSize; ++len)
- pathExplored[len]+=exploredPaths[len];
-
-
- m_rangeCache.resize(src.GetSize(),vTPC(src.GetSize(),0));
-
- for(std::map<Range,E2Costs>::const_iterator i=cov2cand.begin(); i!=cov2cand.end(); ++i) {
- assert(i->first.first<m_rangeCache.size());
- assert(i->first.second>0);
- assert(static_cast<size_t>(i->first.second-1)<m_rangeCache[i->first.first].size());
- assert(m_rangeCache[i->first.first][i->first.second-1]==0);
-
- std::vector<TargetPhrase> tCands;
- tCands.reserve(i->second.size());
-
- std::vector<std::pair<float,size_t> > costs;
- costs.reserve(i->second.size());
-
- std::vector<Phrase> sourcePhrases;
- sourcePhrases.reserve(i->second.size());
-
- for(E2Costs::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) {
- TScores const & scores=j->second;
- TargetPhrase targetPhrase;
- CreateTargetPhrase(targetPhrase
- , j ->first
- , scores.transScore
- , scores.inputScores
- , NULL
- , scores.src);
- costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
- tCands.push_back(targetPhrase);
-
- sourcePhrases.push_back(*scores.src);
-
- //std::cerr << i->first.first << "-" << i->first.second << ": " << targetPhrase << std::endl;
- }
-
- TargetPhraseCollectionWithSourcePhrase *rv=PruneTargetCandidates(tCands, costs, sourcePhrases);
-
- if(rv->IsEmpty())
- delete rv;
- else {
- m_rangeCache[i->first.first][i->first.second-1]=rv;
- m_tgtColls.push_back(rv);
- }
- }
- // free memory
- m_dict->FreeMemory();
- }
-
+ void CacheSource(ConfusionNet const& src);
size_t GetNumInputScores() const {
return m_numInputScores;
diff --git a/moses/PP/CountsPhraseProperty.cpp b/moses/PP/CountsPhraseProperty.cpp
new file mode 100644
index 000000000..b64366733
--- /dev/null
+++ b/moses/PP/CountsPhraseProperty.cpp
@@ -0,0 +1,38 @@
+#include "moses/PP/CountsPhraseProperty.h"
+#include <sstream>
+#include <assert.h>
+
+namespace Moses
+{
+
+void CountsPhraseProperty::ProcessValue(const std::string &value)
+{
+ std::istringstream tokenizer(value);
+
+ if (! (tokenizer >> m_targetMarginal)) { // first token: countE
+ UTIL_THROW2("CountsPhraseProperty: Not able to read target marginal. Flawed property?");
+ }
+ assert( m_targetMarginal > 0 );
+
+ if (! (tokenizer >> m_sourceMarginal)) { // first token: countF
+ UTIL_THROW2("CountsPhraseProperty: Not able to read source marginal. Flawed property?");
+ }
+ assert( m_sourceMarginal > 0 );
+
+ if (! (tokenizer >> m_jointCount)) { // first token: countEF
+ UTIL_THROW2("CountsPhraseProperty: Not able to read joint count. Flawed property?");
+ }
+ assert( m_jointCount > 0 );
+};
+
+std::ostream& operator<<(std::ostream &out, const CountsPhraseProperty &obj)
+{
+ out << "Count property="
+ << obj.GetTargetMarginal() << " "
+ << obj.GetSourceMarginal() << " "
+ << obj.GetJointCount();
+ return out;
+}
+
+} // namespace Moses
+
diff --git a/moses/PP/CountsPhraseProperty.h b/moses/PP/CountsPhraseProperty.h
new file mode 100644
index 000000000..4f6fbcfa8
--- /dev/null
+++ b/moses/PP/CountsPhraseProperty.h
@@ -0,0 +1,62 @@
+
+#pragma once
+
+#include "moses/PP/PhraseProperty.h"
+#include "util/exception.hh"
+#include <string>
+#include <list>
+
+namespace Moses
+{
+
+// A simple phrase property class to access the three phrase count values.
+//
+// The counts are usually not needed during decoding and are not loaded
+// from the phrase table. This is just a workaround that can make them
+// available to features which have a use for them.
+//
+// If you need access to the counts, copy the two marginal counts and the
+// joint count into an additional information property with key "Counts",
+// e.g. using awk:
+//
+// $ zcat phrase-table.gz | awk -F' \|\|\| ' '{printf("%s {{Counts %s}}\n",$0,$5);}' | gzip -c > phrase-table.withCountsPP.gz
+//
+// CountsPhraseProperty reads them from the phrase table and provides
+// methods GetSourceMarginal(), GetTargetMarginal(), GetJointCount().
+
+
+class CountsPhraseProperty : public PhraseProperty
+{
+ friend std::ostream& operator<<(std::ostream &, const CountsPhraseProperty &);
+
+public:
+
+ CountsPhraseProperty() {};
+
+ virtual void ProcessValue(const std::string &value);
+
+ size_t GetSourceMarginal() const {
+ return m_sourceMarginal;
+ }
+
+ size_t GetTargetMarginal() const {
+ return m_targetMarginal;
+ }
+
+ float GetJointCount() const {
+ return m_jointCount;
+ }
+
+ virtual const std::string *GetValueString() const {
+ UTIL_THROW2("CountsPhraseProperty: value string not available in this phrase property");
+ return NULL;
+ };
+
+protected:
+
+ float m_sourceMarginal, m_targetMarginal, m_jointCount;
+
+};
+
+} // namespace Moses
+
diff --git a/moses/PP/Factory.cpp b/moses/PP/Factory.cpp
index 42e4f5ea7..cc393b18d 100644
--- a/moses/PP/Factory.cpp
+++ b/moses/PP/Factory.cpp
@@ -4,7 +4,12 @@
#include <iostream>
#include <vector>
+#include "moses/PP/CountsPhraseProperty.h"
+#include "moses/PP/SourceLabelsPhraseProperty.h"
#include "moses/PP/TreeStructurePhraseProperty.h"
+#include "moses/PP/SpanLengthPhraseProperty.h"
+#include "moses/PP/NonTermContextProperty.h"
+#include "moses/PP/OrientationPhraseProperty.h"
namespace Moses
{
@@ -34,8 +39,8 @@ template <class P> class DefaultPhrasePropertyCreator : public PhrasePropertyCre
{
public:
boost::shared_ptr<PhraseProperty> CreateProperty(const std::string &value) {
- P* property = new P(value);
- property->ProcessValue();
+ P* property = new P();
+ property->ProcessValue(value);
return Create(property);
}
};
@@ -50,8 +55,12 @@ PhrasePropertyFactory::PhrasePropertyFactory()
// Properties with different key than class.
#define MOSES_PNAME2(name, type) Add(name, new DefaultPhrasePropertyCreator< type >());
+ MOSES_PNAME2("Counts", CountsPhraseProperty);
+ MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty);
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
-
+ MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
+ MOSES_PNAME2("NonTermContext", NonTermContextProperty);
+ MOSES_PNAME2("Orientation", OrientationPhraseProperty);
}
PhrasePropertyFactory::~PhrasePropertyFactory()
diff --git a/moses/PP/NonTermContextProperty.cpp b/moses/PP/NonTermContextProperty.cpp
new file mode 100644
index 000000000..df5e88d8e
--- /dev/null
+++ b/moses/PP/NonTermContextProperty.cpp
@@ -0,0 +1,137 @@
+#include "moses/PP/NonTermContextProperty.h"
+#include <string>
+#include <assert.h>
+#include "moses/Util.h"
+#include "moses/FactorCollection.h"
+
+using namespace std;
+
+namespace Moses
+{
+NonTermContextProperty::NonTermContextProperty()
+{
+}
+
+NonTermContextProperty::~NonTermContextProperty()
+{
+ //RemoveAllInColl(m_probStores);
+}
+
+void NonTermContextProperty::ProcessValue(const std::string &value)
+{
+ vector<string> toks;
+ Tokenize(toks, value);
+
+ FactorCollection &fc = FactorCollection::Instance();
+
+ size_t numNT = Scan<size_t>(toks[0]);
+ m_probStores.resize(numNT);
+
+ size_t ind = 1;
+ while (ind < toks.size()) {
+ vector<const Factor *> factors;
+
+ for (size_t nt = 0; nt < numNT; ++nt) {
+ size_t ntInd = Scan<size_t>(toks[ind]);
+ assert(nt == ntInd);
+ ++ind;
+
+ for (size_t contextInd = 0; contextInd < 4; ++contextInd) {
+ //cerr << "toks[" << ind << "]=" << toks[ind] << endl;
+ const Factor *factor = fc.AddFactor(toks[ind], false);
+ factors.push_back(factor);
+ ++ind;
+ }
+ }
+
+ // done with the context. Just get the count and put it all into data structures
+ // cerr << "count=" << toks[ind] << endl;
+ float count = Scan<float>(toks[ind]);
+ ++ind;
+
+ for (size_t i = 0; i < factors.size(); ++i) {
+ size_t ntInd = i / 4;
+ size_t contextInd = i % 4;
+ const Factor *factor = factors[i];
+ AddToMap(ntInd, contextInd, factor, count);
+ }
+ }
+}
+
+void NonTermContextProperty::AddToMap(size_t ntIndex, size_t index, const Factor *factor, float count)
+{
+ if (ntIndex <= m_probStores.size()) {
+ m_probStores.resize(ntIndex + 1);
+ }
+
+ ProbStore &probStore = m_probStores[ntIndex];
+ probStore.AddToMap(index, factor, count);
+}
+
+float NonTermContextProperty::GetProb(size_t ntInd,
+ size_t contextInd,
+ const Factor *factor,
+ float smoothConstant) const
+{
+ UTIL_THROW_IF2(ntInd >= m_probStores.size(), "Invalid nt index=" << ntInd);
+ const ProbStore &probStore = m_probStores[ntInd];
+ float ret = probStore.GetProb(contextInd, factor, smoothConstant);
+ return ret;
+}
+
+//////////////////////////////////////////
+
+void NonTermContextProperty::ProbStore::AddToMap(size_t index, const Factor *factor, float count)
+{
+ Map &map = m_vec[index];
+
+ Map::iterator iter = map.find(factor);
+ if (iter == map.end()) {
+ map[factor] = count;
+ }
+ else {
+ float &currCount = iter->second;
+ currCount += count;
+ }
+
+ m_totalCount += count;
+}
+
+
+float NonTermContextProperty::ProbStore::GetProb(size_t contextInd,
+ const Factor *factor,
+ float smoothConstant) const
+{
+ float count = GetCount(contextInd, factor, smoothConstant);
+ float total = GetTotalCount(contextInd, smoothConstant);
+ float ret = count / total;
+ return ret;
+}
+
+float NonTermContextProperty::ProbStore::GetCount(size_t contextInd,
+ const Factor *factor,
+ float smoothConstant) const
+{
+ const Map &map = m_vec[contextInd];
+
+ float count = smoothConstant;
+ Map::const_iterator iter = map.find(factor);
+ if (iter == map.end()) {
+ // nothing
+ }
+ else {
+ count += iter->second;
+ }
+
+ return count;
+}
+
+float NonTermContextProperty::ProbStore::GetTotalCount(size_t contextInd, float smoothConstant) const
+{
+ const Map &map = m_vec[contextInd];
+ return m_totalCount + smoothConstant * map.size();
+}
+
+
+} // namespace Moses
+
diff --git a/moses/PP/NonTermContextProperty.h b/moses/PP/NonTermContextProperty.h
new file mode 100644
index 000000000..56db9cb32
--- /dev/null
+++ b/moses/PP/NonTermContextProperty.h
@@ -0,0 +1,73 @@
+
+#pragma once
+
+#include "moses/PP/PhraseProperty.h"
+#include "util/exception.hh"
+#include <string>
+#include <list>
+#include <map>
+#include <vector>
+
+namespace Moses
+{
+class Factor;
+
+class NonTermContextProperty : public PhraseProperty
+{
+public:
+
+ NonTermContextProperty();
+ ~NonTermContextProperty();
+
+ virtual void ProcessValue(const std::string &value);
+
+ virtual const std::string *GetValueString() const {
+ UTIL_THROW2("NonTermContextProperty: value string not available in this phrase property");
+ return NULL;
+ };
+
+ float GetProb(size_t ntInd,
+ size_t contextInd,
+ const Factor *factor,
+ float smoothConstant) const;
+
+protected:
+
+ class ProbStore {
+ typedef std::map<const Factor*, float> Map; // map word -> prob
+ typedef std::vector<Map> Vec; // left outside, left inside, right inside, right outside
+ Vec m_vec;
+ float m_totalCount;
+
+ float GetCount(size_t contextInd,
+ const Factor *factor,
+ float smoothConstant) const;
+ float GetTotalCount(size_t contextInd, float smoothConstant) const;
+
+ public:
+
+ ProbStore()
+ :m_vec(4)
+ ,m_totalCount(0)
+ {}
+
+ float GetProb(size_t contextInd,
+ const Factor *factor,
+ float smoothConstant) const;
+
+ float GetSize(size_t index) const
+ { return m_vec[index].size(); }
+
+ void AddToMap(size_t index, const Factor *factor, float count);
+
+ };
+
+ // by nt index
+ std::vector<ProbStore> m_probStores;
+
+ void AddToMap(size_t ntIndex, size_t index, const Factor *factor, float count);
+
+};
+
+} // namespace Moses
+
diff --git a/moses/PP/OrientationPhraseProperty.cpp b/moses/PP/OrientationPhraseProperty.cpp
new file mode 100644
index 000000000..1722a5383
--- /dev/null
+++ b/moses/PP/OrientationPhraseProperty.cpp
@@ -0,0 +1,26 @@
+#include "moses/PP/OrientationPhraseProperty.h"
+#include <iostream>
+
+
+namespace Moses
+{
+
+void OrientationPhraseProperty::ProcessValue(const std::string &value)
+{
+ // bidirectional MSLR phrase orientation with 2x4 orientation classes:
+ // mono swap dleft dright
+
+ std::istringstream tokenizer(value);
+
+ try {
+ if (! (tokenizer >> m_l2rMonoProbability >> m_l2rSwapProbability >> m_l2rDleftProbability >> m_l2rDrightProbability
+ >> m_r2lMonoProbability >> m_r2lSwapProbability >> m_r2lDleftProbability >> m_r2lDrightProbability)) {
+ UTIL_THROW2("OrientationPhraseProperty: Not able to read value. Flawed property?");
+ }
+ } catch (const std::exception &e) {
+ UTIL_THROW2("OrientationPhraseProperty: Read error. Flawed property?");
+ }
+};
+
+} // namespace Moses
+
diff --git a/moses/PP/OrientationPhraseProperty.h b/moses/PP/OrientationPhraseProperty.h
new file mode 100644
index 000000000..f6344062c
--- /dev/null
+++ b/moses/PP/OrientationPhraseProperty.h
@@ -0,0 +1,73 @@
+
+#pragma once
+
+#include "moses/PP/PhraseProperty.h"
+#include "util/exception.hh"
+#include <string>
+
+namespace Moses
+{
+
+class OrientationPhraseProperty : public PhraseProperty
+{
+public:
+ OrientationPhraseProperty() {};
+
+ virtual void ProcessValue(const std::string &value);
+
+
+ double GetLeftToRightProbabilityMono() const {
+ return m_l2rMonoProbability;
+ };
+
+ double GetLeftToRightProbabilitySwap() const {
+ return m_l2rSwapProbability;
+ };
+
+ double GetLeftToRightProbabilityDleft() const {
+ return m_l2rDleftProbability;
+ };
+
+ double GetLeftToRightProbabilityDright() const {
+ return m_l2rDrightProbability;
+ };
+
+ double GetLeftToRightProbabilityDiscontinuous() const {
+ return m_l2rDleftProbability + m_l2rDrightProbability;
+ };
+
+
+ double GetRightToLeftProbabilityMono() const {
+ return m_r2lMonoProbability;
+ };
+
+ double GetRightToLeftProbabilitySwap() const {
+ return m_r2lSwapProbability;
+ };
+
+ double GetRightToLeftProbabilityDleft() const {
+ return m_r2lDleftProbability;
+ };
+
+ double GetRightToLeftProbabilityDright() const {
+ return m_r2lDrightProbability;
+ };
+
+ double GetRightToLeftProbabilityDiscontinuous() const {
+ return m_r2lDleftProbability + m_r2lDrightProbability;
+ };
+
+
+ virtual const std::string *GetValueString() const {
+ UTIL_THROW2("OrientationPhraseProperty: value string not available in this phrase property");
+ return NULL;
+ };
+
+protected:
+
+ float m_l2rMonoProbability, m_l2rSwapProbability, m_l2rDrightProbability, m_l2rDleftProbability,
+ m_r2lMonoProbability, m_r2lSwapProbability, m_r2lDrightProbability, m_r2lDleftProbability;
+};
+
+} // namespace Moses
+
diff --git a/moses/PP/PhraseProperty.cpp b/moses/PP/PhraseProperty.cpp
new file mode 100644
index 000000000..614b39c60
--- /dev/null
+++ b/moses/PP/PhraseProperty.cpp
@@ -0,0 +1,13 @@
+#include "PhraseProperty.h"
+
+namespace Moses
+{
+
+std::ostream& operator<<(std::ostream &out, const PhraseProperty &obj)
+{
+ out << "Base phrase property";
+ return out;
+}
+
+}
+
diff --git a/moses/PP/PhraseProperty.h b/moses/PP/PhraseProperty.h
index 6889afa10..b7437369b 100644
--- a/moses/PP/PhraseProperty.h
+++ b/moses/PP/PhraseProperty.h
@@ -10,18 +10,19 @@ namespace Moses
*/
class PhraseProperty
{
+ friend std::ostream& operator<<(std::ostream &, const PhraseProperty &);
+
public:
- PhraseProperty(const std::string &value) : m_value(value) {};
+ PhraseProperty() : m_value(NULL) {};
+ ~PhraseProperty() { if ( m_value != NULL ) delete m_value; };
- virtual void ProcessValue() {};
+ virtual void ProcessValue(const std::string &value) { m_value = new std::string(value); };
- const std::string &GetValueString() {
- return m_value;
- };
+ virtual const std::string *GetValueString() const { return m_value; };
protected:
- const std::string m_value;
+ std::string *m_value;
};
diff --git a/moses/PP/SourceLabelsPhraseProperty.cpp b/moses/PP/SourceLabelsPhraseProperty.cpp
new file mode 100644
index 000000000..8e6a5dd6d
--- /dev/null
+++ b/moses/PP/SourceLabelsPhraseProperty.cpp
@@ -0,0 +1,124 @@
+#include "moses/PP/SourceLabelsPhraseProperty.h"
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <string>
+#include <queue>
+#include <assert.h>
+#include <limits>
+
+namespace Moses
+{
+
+void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
+{
+ std::istringstream tokenizer(value);
+
+ if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
+ UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
+ }
+ assert( m_nNTs > 0 );
+
+ if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
+ UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
+ }
+ assert( m_totalCount > 0.0 );
+
+
+
+ // read source-labelled rule items
+
+ std::priority_queue<float> ruleLabelledCountsPQ;
+
+ while (tokenizer.peek() != EOF) {
+// try {
+
+ SourceLabelsPhrasePropertyItem item;
+ size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
+
+ if (m_nNTs == 1) {
+
+ item.m_sourceLabelsRHSCount = m_totalCount;
+
+ } else { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
+
+ for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
+ size_t sourceLabelRHS;
+ if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
+ UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
+ }
+ item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
+ }
+
+ if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
+ UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
+ }
+
+ if (! (tokenizer >> numberOfLHSsGivenRHS)) {
+ UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
+ }
+ }
+
+ for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
+ size_t sourceLabelLHS;
+ if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
+ UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
+ }
+ float ruleSourceLabelledCount;
+ if (! (tokenizer >> ruleSourceLabelledCount)) {
+ UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
+ }
+ item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
+ ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
+ }
+
+ m_sourceLabelItems.push_back(item);
+
+// } catch (const std::exception &e) {
+// UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
+// }
+ }
+
+ // keep only top N label vectors
+ const size_t N=50;
+
+ if (ruleLabelledCountsPQ.size() > N) {
+
+ float topNRuleLabelledCount = std::numeric_limits<int>::max();
+ for (size_t i=0; !ruleLabelledCountsPQ.empty() && i<N; ++i) {
+ topNRuleLabelledCount = ruleLabelledCountsPQ.top();
+ ruleLabelledCountsPQ.pop();
+ }
+
+ size_t nKept=0;
+ std::list<SourceLabelsPhrasePropertyItem>::iterator itemIter=m_sourceLabelItems.begin();
+ while (itemIter!=m_sourceLabelItems.end()) {
+ if (itemIter->m_sourceLabelsRHSCount < topNRuleLabelledCount) {
+ itemIter = m_sourceLabelItems.erase(itemIter);
+ } else {
+ std::list< std::pair<size_t,float> >::iterator itemLHSIter=(itemIter->m_sourceLabelsLHSList).begin();
+ while (itemLHSIter!=(itemIter->m_sourceLabelsLHSList).end()) {
+ if (itemLHSIter->second < topNRuleLabelledCount) {
+ itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter);
+ } else {
+ if (nKept >= N) {
+ itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter,(itemIter->m_sourceLabelsLHSList).end());
+ } else {
+ ++nKept;
+ ++itemLHSIter;
+ }
+ }
+ }
+ if ((itemIter->m_sourceLabelsLHSList).empty()) {
+ itemIter = m_sourceLabelItems.erase(itemIter);
+ } else {
+ ++itemIter;
+ }
+ }
+ }
+ }
+};
+
+} // namespace Moses
+
diff --git a/moses/PP/SourceLabelsPhraseProperty.h b/moses/PP/SourceLabelsPhraseProperty.h
new file mode 100644
index 000000000..39b43ad3e
--- /dev/null
+++ b/moses/PP/SourceLabelsPhraseProperty.h
@@ -0,0 +1,77 @@
+
+#pragma once
+
+#include "moses/PP/PhraseProperty.h"
+#include "util/exception.hh"
+#include <string>
+#include <list>
+
+namespace Moses
+{
+
+// Note that we require label tokens (strings) in the corresponding property values of phrase table entries
+// to be replaced beforehand by indices (size_t) of a label vocabulary. (TODO: change that?)
+
+class SourceLabelsPhrasePropertyItem
+{
+friend class SourceLabelsPhraseProperty;
+
+public:
+ SourceLabelsPhrasePropertyItem() {};
+
+ float GetSourceLabelsRHSCount() const
+ {
+ return m_sourceLabelsRHSCount;
+ };
+
+ const std::list<size_t> &GetSourceLabelsRHS() const
+ {
+ return m_sourceLabelsRHS;
+ };
+
+ const std::list< std::pair<size_t,float> > &GetSourceLabelsLHSList() const
+ {
+ return m_sourceLabelsLHSList;
+ };
+
+private:
+ float m_sourceLabelsRHSCount;
+ std::list<size_t> m_sourceLabelsRHS; // should be of size nNTs-1 (empty if initial rule, i.e. no right-hand side non-terminals)
+ std::list< std::pair<size_t,float> > m_sourceLabelsLHSList; // list of left-hand sides for this right-hand side, with counts
+};
+
+
+class SourceLabelsPhraseProperty : public PhraseProperty
+{
+public:
+ SourceLabelsPhraseProperty() {};
+
+ virtual void ProcessValue(const std::string &value);
+
+ size_t GetNumberOfNonTerminals() const {
+ return m_nNTs;
+ }
+
+ float GetTotalCount() const {
+ return m_totalCount;
+ }
+
+ const std::list<SourceLabelsPhrasePropertyItem> &GetSourceLabelItems() const {
+ return m_sourceLabelItems;
+ };
+
+ virtual const std::string *GetValueString() const {
+ UTIL_THROW2("SourceLabelsPhraseProperty: value string not available in this phrase property");
+ return NULL;
+ };
+
+protected:
+
+ size_t m_nNTs;
+ float m_totalCount;
+
+ std::list<SourceLabelsPhrasePropertyItem> m_sourceLabelItems;
+};
+
+} // namespace Moses
+
diff --git a/moses/PP/SpanLengthPhraseProperty.cpp b/moses/PP/SpanLengthPhraseProperty.cpp
new file mode 100644
index 000000000..d45c7b919
--- /dev/null
+++ b/moses/PP/SpanLengthPhraseProperty.cpp
@@ -0,0 +1,127 @@
+#include "SpanLengthPhraseProperty.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+
+using namespace std;
+
+namespace Moses
+{
+SpanLengthPhraseProperty::SpanLengthPhraseProperty()
+{
+}
+
+void SpanLengthPhraseProperty::ProcessValue(const std::string &value)
+{
+ vector<string> toks;
+ Tokenize(toks, value);
+
+ set< vector<string> > indices;
+
+ for (size_t i = 0; i < toks.size(); ++i) {
+ const string &span = toks[i];
+
+ // is it a ntIndex,sourceSpan,targetSpan or count ?
+ vector<string> toks;
+ Tokenize<string>(toks, span, ",");
+ UTIL_THROW_IF2(toks.size() != 1 && toks.size() != 3, "Incorrect format for SpanLength: " << span);
+
+ if (toks.size() == 1) {
+ float count = Scan<float>(toks[0]);
+ Populate(indices, count);
+
+ indices.clear();
+ }
+ else {
+ indices.insert(toks);
+ }
+ }
+
+ // totals
+ CalcTotals(m_source);
+ CalcTotals(m_target);
+}
+
+void SpanLengthPhraseProperty::Populate(const set< vector<string> > &indices, float count)
+{
+ set< vector<string> >::const_iterator iter;
+ for (iter = indices.begin(); iter != indices.end(); ++iter) {
+ const vector<string> &toksStr = *iter;
+ vector<size_t> toks = Scan<size_t>(toksStr);
+ UTIL_THROW_IF2(toks.size() != 3, "Incorrect format for SpanLength. Size is " << toks.size());
+
+ Populate(toks, count);
+ }
+}
+
+void SpanLengthPhraseProperty::Populate(const std::vector<size_t> &toks, float count)
+{
+ size_t ntInd = toks[0];
+ size_t sourceLength = toks[1];
+ size_t targetLength = toks[2];
+ if (ntInd >= m_source.size() ) {
+ m_source.resize(ntInd + 1);
+ m_target.resize(ntInd + 1);
+ }
+
+ Map &sourceMap = m_source[ntInd].first;
+ Map &targetMap = m_target[ntInd].first;
+ Populate(sourceMap, sourceLength, count);
+ Populate(targetMap, targetLength, count);
+}
+
+void SpanLengthPhraseProperty::Populate(Map &map, size_t span, float count)
+{
+ Map::iterator iter;
+ iter = map.find(span);
+ if (iter != map.end()) {
+ float &value = iter->second;
+ value += count;
+ }
+ else {
+ map[span] = count;
+ }
+}
+
+void SpanLengthPhraseProperty::CalcTotals(Vec &vec)
+{
+ for (size_t i = 0; i < vec.size(); ++i) {
+ float total = 0;
+
+ const Map &map = vec[i].first;
+ Map::const_iterator iter;
+ for (iter = map.begin(); iter != map.end(); ++iter) {
+ float count = iter->second;
+ total += count;
+ }
+
+ vec[i].second = total;
+ }
+}
+
+float SpanLengthPhraseProperty::GetProb(size_t ntInd, size_t sourceWidth, float smoothing) const
+{
+ float count;
+
+ const std::pair<Map, float> &data = m_source[ntInd];
+ const Map &map = data.first;
+
+ if (map.size() == 0) {
+ // should this ever be reached? there shouldn't be any span length proprty so FF shouldn't call this
+ return 1.0f;
+ }
+
+ Map::const_iterator iter = map.find(sourceWidth);
+ if (iter == map.end()) {
+ count = 0;
+ }
+ else {
+ count = iter->second;
+ }
+ count += smoothing;
+
+ float total = data.second + smoothing * (float) map.size();
+ float ret = count / total;
+ return ret;
+}
+
+}
diff --git a/moses/PP/SpanLengthPhraseProperty.h b/moses/PP/SpanLengthPhraseProperty.h
new file mode 100644
index 000000000..982c3ca0d
--- /dev/null
+++ b/moses/PP/SpanLengthPhraseProperty.h
@@ -0,0 +1,35 @@
+
+#pragma once
+
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+#include "moses/PP/PhraseProperty.h"
+
+namespace Moses
+{
+
+class SpanLengthPhraseProperty : public PhraseProperty
+{
+public:
+ SpanLengthPhraseProperty();
+
+ void ProcessValue(const std::string &value);
+
+ float GetProb(size_t ntInd, size_t sourceWidth, float smoothing) const;
+protected:
+ // fractional counts
+ typedef std::map<size_t, float> Map;
+ typedef std::vector<std::pair<Map, float> > Vec;
+ Vec m_source, m_target;
+
+ void Populate(const std::set< std::vector<std::string> > &indices, float count);
+ void Populate(const std::vector<size_t> &toks, float count);
+ void Populate(Map &map, size_t span, float count);
+
+ void CalcTotals(Vec &vec);
+};
+
+} // namespace Moses
+
diff --git a/moses/PP/TreeStructurePhraseProperty.h b/moses/PP/TreeStructurePhraseProperty.h
index f9acc38dd..45124973f 100644
--- a/moses/PP/TreeStructurePhraseProperty.h
+++ b/moses/PP/TreeStructurePhraseProperty.h
@@ -10,7 +10,7 @@ namespace Moses
class TreeStructurePhraseProperty : public PhraseProperty
{
public:
- TreeStructurePhraseProperty(const std::string &value) : PhraseProperty(value) {};
+ TreeStructurePhraseProperty() {};
};
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index f9a17fd74..36b9a955d 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -36,6 +36,7 @@ using namespace std;
namespace Moses
{
+
/** define allowed parameters */
Parameter::Parameter()
{
@@ -49,13 +50,14 @@ Parameter::Parameter()
AddParam("factor-delimiter", "fd", "specify a different factor delimiter than the default");
AddParam("input-factors", "list of factors in the input");
AddParam("input-file", "i", "location of the input file to be translated");
- AddParam("inputtype", "text (0), confusion network (1), word lattice (2) (default = 0)");
+ AddParam("inputtype", "text (0), confusion network (1), word lattice (2), tree (3) (default = 0)");
AddParam("labeled-n-best-list", "print out labels for each weight type in n-best list. default is true");
AddParam("mark-unknown", "mu", "mark unknown words in output");
AddParam("max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");
AddParam("max-trans-opt-per-coverage", "maximum number of translation options per input span (after applying mapping steps)");
AddParam("max-phrase-length", "maximum phrase length (default 20)");
AddParam("n-best-list", "file and size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
+ AddParam("n-best-trees", "Write n-best target-side trees to n-best-list");
AddParam("lattice-samples", "generate samples from lattice, in same format as nbest list. Uses the file and size arguments, as in n-best-list");
AddParam("n-best-factor", "factor to compute the maximum number of contenders (=factor*nbest-size). value 0 means infinity, i.e. no threshold. default is 0");
AddParam("print-all-derivations", "to print all derivations in search graph");
@@ -102,7 +104,7 @@ Parameter::Parameter()
AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
- AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF) - the flag should be followed byy a directory name, which must exist");
+ AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF) - the flag should be followed by a directory name, which must exist");
AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder). This flag is followed by 3 values: 'true (gz|txt|bz) directory-name'");
AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
#ifdef HAVE_PROTOBUF
@@ -207,15 +209,31 @@ Parameter::Parameter()
AddParam("placeholder-factor", "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model");
AddParam("no-cache", "Disable all phrase-table caching. Default = false (ie. enable caching)");
+ AddParam("default-non-term-for-empty-range-only", "Don't add [X] to all ranges, just ranges where there isn't a source non-term. Default = false (ie. add [X] everywhere)");
+ AddParam("s2t", "Use specialized string-to-tree decoder.");
+ AddParam("s2t-parsing-algorithm", "Which S2T parsing algorithm to use. 0=recursive CYK+, 1=scope-3 (default = 0)");
- AddParam("adjacent-only", "Only allow hypotheses which are adjacent to current derivation. ITG without block moves");
-
+ AddParam("spe-src", "Simulated post-editing. Source filename");
+ AddParam("spe-trg", "Simulated post-editing. Target filename");
+ AddParam("spe-aln", "Simulated post-editing. Alignment filename");
}
Parameter::~Parameter()
{
}
+const PARAM_VEC *Parameter::GetParam(const std::string &paramName) const
+{
+ PARAM_MAP::const_iterator iter = m_setting.find( paramName );
+ if (iter == m_setting.end()) {
+ return NULL;
+ }
+ else {
+ return &iter->second;
+ }
+
+}
+
/** initialize a parameter, sub of constructor */
void Parameter::AddParam(const string &paramName, const string &description)
{
@@ -324,28 +342,29 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// don't mix old and new format
- if ((isParamSpecified("feature") || isParamSpecified("weight"))
- && (isParamSpecified("weight-slm") || isParamSpecified("weight-bl") || isParamSpecified("weight-d") ||
- isParamSpecified("weight-dlm") || isParamSpecified("weight-lrl") || isParamSpecified("weight-generation") ||
- isParamSpecified("weight-i") || isParamSpecified("weight-l") || isParamSpecified("weight-lex") ||
- isParamSpecified("weight-glm") || isParamSpecified("weight-wt") || isParamSpecified("weight-pp") ||
- isParamSpecified("weight-pb") || isParamSpecified("weight-t") || isParamSpecified("weight-w") ||
- isParamSpecified("weight-p") ||
- isParamSpecified("weight-u") || isParamSpecified("weight-e") ||
- isParamSpecified("dlm-mode") || isParamSpecified("generation-file") || isParamSpecified("global-lexical-file") ||
- isParamSpecified("glm-feature") || isParamSpecified("lmodel-file") || isParamSpecified("lmodel-dub") ||
- isParamSpecified("slmodel-file") || isParamSpecified("slmodel-factor") ||
- isParamSpecified("slmodel-beam") || isParamSpecified("ttable-file") || isParamSpecified("phrase-pair-feature") ||
- isParamSpecified("phrase-boundary-source-feature") || isParamSpecified("phrase-boundary-target-feature") || isParamSpecified("phrase-length-feature") ||
- isParamSpecified("target-word-insertion-feature") || isParamSpecified("source-word-deletion-feature") || isParamSpecified("word-translation-feature")
+ if ((GetParam("feature") || GetParam("weight"))
+ && (GetParam("weight-slm") || GetParam("weight-bl") || GetParam("weight-d") ||
+ GetParam("weight-dlm") || GetParam("weight-lrl") || GetParam("weight-generation") ||
+ GetParam("weight-i") || GetParam("weight-l") || GetParam("weight-lex") ||
+ GetParam("weight-glm") || GetParam("weight-wt") || GetParam("weight-pp") ||
+ GetParam("weight-pb") || GetParam("weight-t") || GetParam("weight-w") ||
+ GetParam("weight-p") ||
+ GetParam("weight-u") || GetParam("weight-e") ||
+ GetParam("dlm-mode") || GetParam("generation-file") || GetParam("global-lexical-file") ||
+ GetParam("glm-feature") || GetParam("lmodel-file") || GetParam("lmodel-dub") ||
+ GetParam("slmodel-file") || GetParam("slmodel-factor") ||
+ GetParam("slmodel-beam") || GetParam("ttable-file") || GetParam("phrase-pair-feature") ||
+ GetParam("phrase-boundary-source-feature") || GetParam("phrase-boundary-target-feature") || GetParam("phrase-length-feature") ||
+ GetParam("target-word-insertion-feature") || GetParam("source-word-deletion-feature") || GetParam("word-translation-feature")
)
) {
UTIL_THROW(util::Exception, "Don't mix old and new ini file format");
}
// convert old weights args to new format
- if (!isParamSpecified("feature"))
+ if (GetParam("feature") == NULL) {
ConvertWeightArgs();
+ }
CreateWeightsMap();
WeightOverwrite();
@@ -370,19 +389,16 @@ bool Parameter::LoadParam(int argc, char* argv[])
void Parameter::AddFeaturesCmd()
{
- if (!isParamSpecified("feature-add")) {
- return;
+ const PARAM_VEC *params = GetParam("feature-add");
+ if (params) {
+ PARAM_VEC::const_iterator iter;
+ for (iter = params->begin(); iter != params->end(); ++iter) {
+ const string &line = *iter;
+ AddFeature(line);
+ }
+
+ m_setting.erase("feature-add");
}
-
- const PARAM_VEC &params = GetParam("feature-add");
-
- PARAM_VEC::const_iterator iter;
- for (iter = params.begin(); iter != params.end(); ++iter) {
- const string &line = *iter;
- AddFeature(line);
- }
-
- m_setting.erase("feature-add");
}
std::vector<float> Parameter::GetWeights(const std::string &name)
@@ -456,9 +472,12 @@ void Parameter::ConvertWeightArgsSingleWeight(const string &oldWeightName, const
void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
{
+ const PARAM_VEC *params;
+
// process input weights 1st
- if (isParamSpecified("weight-i")) {
- vector<float> inputWeights = Scan<float>(m_setting["weight-i"]);
+ params = GetParam("weight-i");
+ if (params) {
+ vector<float> inputWeights = Scan<float>(*params);
PARAM_VEC &numInputScores = m_setting["input-scores"];
if (inputWeights.size() == 1) {
UTIL_THROW_IF2(numInputScores.size() != 0, "No [input-scores] section allowed");
@@ -480,19 +499,26 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
size_t numRealWordsInInput = 0;
map<string, size_t> ptIndices;
- if (GetParam("input-scores").size()) {
- numInputScores = Scan<size_t>(GetParam("input-scores")[0]);
- }
+ params = GetParam("input-scores");
+ if (params) {
+ numInputScores = Scan<size_t>(params->at(0));
- if (GetParam("input-scores").size() > 1) {
- numRealWordsInInput = Scan<size_t>(GetParam("input-scores")[1]);
+ if (params->size() > 1) {
+ numRealWordsInInput = Scan<size_t>(params->at(1));
+ }
}
// load phrase translation tables
- if (GetParam("ttable-file").size() > 0) {
+ params = GetParam("ttable-file");
+ if (params) {
// weights
- const vector<string> &translationVector = GetParam("ttable-file");
- vector<size_t> maxTargetPhrase = Scan<size_t>(GetParam("ttable-limit"));
+ const vector<string> translationVector = *params;
+
+ vector<size_t> maxTargetPhrase;
+ params = GetParam("ttable-limit");
+ if (params) {
+ maxTargetPhrase = Scan<size_t>(*params);
+ }
if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) {
VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl);
@@ -520,32 +546,32 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
}
UTIL_THROW_IF2(token.size() < 5, "Phrase table must have at least 5 scores");
- PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(token[0]);
+ int implementation = Scan<int>(token[0]);
string ptType;
switch (implementation) {
- case Memory:
+ case 0: // Memory
ptType = "PhraseDictionaryMemory";
break;
- case Binary:
+ case 1: // Binary
ptType = "PhraseDictionaryBinary";
break;
- case OnDisk:
+ case 2: // OnDisk
ptType = "PhraseDictionaryOnDisk";
break;
- case SCFG:
+ case 6: // SCFG
ptType = "PhraseDictionaryMemory";
break;
- case Compact:
+ case 12: // Compact
ptType = "PhraseDictionaryCompact";
break;
- case SuffixArray:
+ case 8: // SuffixArray
ptType = "PhraseDictionarySuffixArray";
break;
- case DSuffixArray:
+ case 14: // DSuffixArray
ptType = "PhraseDictionaryDynSuffixArray";
break;
- case DCacheBased:
+ case 15: // DCacheBased:
ptType = "PhraseDictionaryDynamicCacheBased";
break;
default:
@@ -601,7 +627,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
ptLine << "num-features=" << numScoreComponent << " ";
ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";
- if (implementation == SuffixArray || implementation == DSuffixArray) {
+ if (implementation == 8 || implementation == 14) {
ptLine << "target-path=" << token[5] << " ";
ptLine << "alignment-path=" << token[6] << " ";
}
@@ -629,37 +655,36 @@ void Parameter::ConvertWeightArgsDistortion()
const string oldLexReordingName = "distortion-file";
// distortion / lex distortion
- const PARAM_VEC &oldWeights = GetParam(oldWeightName);
+ const PARAM_VEC *oldWeights = GetParam(oldWeightName);
- if (oldWeights.size() > 0) {
- if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
- ||Trim(GetParam("search-algorithm")[0]) == "1"
- )
+ if (oldWeights) {
+ const PARAM_VEC *searchAlgo = GetParam("search-algorithm");
+ if (searchAlgo == NULL ||
+ (searchAlgo->size() > 0
+ && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1")
)
) {
// phrase-based. Add distance distortion to list of features
AddFeature("Distortion");
- SetWeight("Distortion", 0, Scan<float>(oldWeights[0]));
+ SetWeight("Distortion", 0, Scan<float>(oldWeights->at(0)));
}
// everything but the last is lex reordering model
size_t currOldInd = 1;
- const PARAM_VEC &lextable = GetParam(oldLexReordingName);
+ const PARAM_VEC *lextable = GetParam(oldLexReordingName);
- for (size_t indTable = 0; indTable < lextable.size(); ++indTable) {
- const string &line = lextable[indTable];
+ for (size_t indTable = 0; lextable && indTable < lextable->size(); ++indTable) {
+ const string &line = lextable->at(indTable);
vector<string> toks = Tokenize(line);
size_t numFF = Scan<size_t>(toks[2]);
vector<float> weights(numFF);
for (size_t currFF = 0; currFF < numFF; ++currFF) {
- UTIL_THROW_IF2(currOldInd >= oldWeights.size(),
- "Errors converting old distortion weights to new weights");
- float weight = Scan<float>(oldWeights[currOldInd]);
+ UTIL_THROW_IF2(oldWeights && currOldInd >= oldWeights->size(),
+ "Errors converting old distortion weights to new weights");
+ float weight = Scan<float>(oldWeights->at(currOldInd));
weights[currFF] = weight;
++currOldInd;
@@ -693,21 +718,23 @@ void Parameter::ConvertWeightArgsLM()
{
const string oldWeightName = "weight-l";
const string oldFeatureName = "lmodel-file";
+ const PARAM_VEC *params;
bool isChartDecoding = true;
- if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
- ||Trim(GetParam("search-algorithm")[0]) == "1"
- )
+
+ params = GetParam("search-algorithm");
+ if (params == NULL ||
+ (params->size() > 0
+ && (Trim(params->at(0)) == "0" || Trim(params->at(0)) == "1")
)
) {
isChartDecoding = false;
}
vector<int> oovWeights;
- if (isParamSpecified("lmodel-oov-feature")) {
- oovWeights = Scan<int>(m_setting["lmodel-oov-feature"]);
+ params = GetParam("lmodel-oov-feature");
+ if (params) {
+ oovWeights = Scan<int>(*params);
}
PARAM_MAP::iterator iterMap;
@@ -827,11 +854,10 @@ void Parameter::ConvertWeightArgsWordPenalty()
const std::string newWeightName = "WordPenalty";
bool isChartDecoding = true;
- if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
- ||Trim(GetParam("search-algorithm")[0]) == "1"
- )
+ const PARAM_VEC *searchAlgo = GetParam("search-algorithm");
+ if (searchAlgo == NULL ||
+ (searchAlgo->size() > 0
+ && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1")
)
) {
isChartDecoding = false;
@@ -858,10 +884,11 @@ void Parameter::ConvertWeightArgsWordPenalty()
void Parameter::ConvertPhrasePenalty()
{
string oldWeightName = "weight-p";
- if (isParamSpecified(oldWeightName)) {
- UTIL_THROW_IF2(m_setting[oldWeightName].size() != 1,
- "There should be only 1 phrase-penalty weight");
- float weight = Scan<float>(m_setting[oldWeightName][0]);
+ const PARAM_VEC *params = GetParam(oldWeightName);
+ if (params) {
+ UTIL_THROW_IF2(params->size() != 1,
+ "There should be only 1 phrase-penalty weight");
+ float weight = Scan<float>(params->at(0));
AddFeature("PhrasePenalty");
SetWeight("PhrasePenalty", 0, weight);
@@ -1371,6 +1398,27 @@ void Parameter::Save(const std::string path)
file.close();
}
+template<>
+void Parameter::SetParameter<bool>(bool &parameter, const std::string &parameterName, const bool &defaultValue) const
+{
+ const PARAM_VEC *params = GetParam(parameterName);
+
+ // default value if nothing is specified
+ parameter = defaultValue;
+ if (params == NULL) {
+ return;
+ }
+
+ // if parameter is just specified as, e.g. "-parameter" set it true
+ if (params->size() == 0) {
+ parameter = true;
+ }
+ // if paramter is specified "-parameter true" or "-parameter false"
+ else if (params->size() == 1) {
+ parameter = Scan<bool>( params->at(0));
+ }
}
+} // namespace
+
diff --git a/moses/Parameter.h b/moses/Parameter.h
index ef2f097e8..9a290f7c4 100644
--- a/moses/Parameter.h
+++ b/moses/Parameter.h
@@ -27,6 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <map>
#include <vector>
#include "TypeDef.h"
+#include "Util.h"
namespace Moses
{
@@ -91,31 +92,15 @@ public:
void Explain();
/** return a vector of strings holding the whitespace-delimited values on the ini-file line corresponding to the given parameter name */
- const PARAM_VEC &GetParam(const std::string &paramName) {
- return m_setting[paramName];
- }
+ const PARAM_VEC *GetParam(const std::string &paramName) const;
+
/** check if parameter is defined (either in moses.ini or as switch) */
- bool isParamSpecified(const std::string &paramName) {
+ bool isParamSpecified(const std::string &paramName) const {
return m_setting.find( paramName ) != m_setting.end();
}
- const std::string GetFullName(std::string abbr) {
- return m_fullname[abbr];
- }
-
- const std::string GetAbbreviation(std::string full) {
- return m_abbreviation[full];
- }
- const PARAM_VEC &GetParamShortName(const std::string &paramName) {
- return GetParam(GetFullName(paramName));
- }
-
void OverwriteParam(const std::string &paramName, PARAM_VEC values);
- void OverwriteParamShortName(const std::string &paramShortName, PARAM_VEC values) {
- OverwriteParam(GetFullName(paramShortName),values);
- }
-
std::vector<float> GetWeights(const std::string &name);
std::map<std::string, std::vector<float> > GetAllWeights() const {
return m_weights;
@@ -127,8 +112,24 @@ public:
}
void Save(const std::string path);
+
+ template<typename T>
+ void SetParameter(T &var, const std::string &name, const T &defaultValue) const
+ {
+ const PARAM_VEC *params = GetParam(name);
+ if (params && params->size()) {
+ var = Scan<T>( params->at(0));
+ }
+ else {
+ var = defaultValue;
+ }
+ }
+
};
+template<>
+void Parameter::SetParameter<bool>(bool &var, const std::string &name, const bool &defaultValue) const;
+
}
#endif
diff --git a/moses/Phrase.h b/moses/Phrase.h
index e910cbb2a..947e50905 100644
--- a/moses/Phrase.h
+++ b/moses/Phrase.h
@@ -47,8 +47,8 @@ class WordsRange;
class Phrase
{
friend std::ostream& operator<<(std::ostream&, const Phrase&);
-private:
-
+ // private:
+protected:
std::vector<Word> m_words;
public:
diff --git a/moses/PrefixTree.h b/moses/PrefixTree.h
index f7a869e49..c65daa691 100644
--- a/moses/PrefixTree.h
+++ b/moses/PrefixTree.h
@@ -172,7 +172,8 @@ public:
ptr.clear();
ptr.resize(keys.size());
std::vector<OFF_T> rawOffs(keys.size());
- fread(&rawOffs[0], sizeof(OFF_T), keys.size(), f);
+ size_t bytes_read = fread(&rawOffs[0], sizeof(OFF_T), keys.size(), f);
+ UTIL_THROW_IF2(bytes_read != keys.size(), "Read error at " << HERE);
for(size_t i=0; i<ptr.size(); ++i)
if (rawOffs[i]) ptr[i].set(f, rawOffs[i]);
}
diff --git a/moses/RuleCubeItem.cpp b/moses/RuleCubeItem.cpp
index 4525d059e..970bac94d 100644
--- a/moses/RuleCubeItem.cpp
+++ b/moses/RuleCubeItem.cpp
@@ -79,7 +79,7 @@ void RuleCubeItem::CreateHypothesis(const ChartTranslationOptions &transOpt,
ChartManager &manager)
{
m_hypothesis = new ChartHypothesis(transOpt, *this, manager);
- m_hypothesis->Evaluate();
+ m_hypothesis->EvaluateWhenApplied();
m_score = m_hypothesis->GetTotalScore();
}
diff --git a/moses/ScoreComponentCollection.cpp b/moses/ScoreComponentCollection.cpp
index e252d1a7a..eedaa589e 100644
--- a/moses/ScoreComponentCollection.cpp
+++ b/moses/ScoreComponentCollection.cpp
@@ -179,25 +179,31 @@ void ScoreComponentCollection::SparseL2Regularize(float lambda)
m_scores.sparseL2regularize(lambda);
}
-void ScoreComponentCollection::Save(ostream& out) const
+void ScoreComponentCollection::Save(ostream& out, bool multiline) const
{
+ string sep = " ";
+ string linesep = "\n";
+ if (!multiline) {
+ sep = "=";
+ linesep = " ";
+ }
ScoreIndexMap::const_iterator iter = s_scoreIndexes.begin();
for (; iter != s_scoreIndexes.end(); ++iter ) {
string name = iter->first->GetScoreProducerDescription();
IndexPair ip = iter->second; // feature indices
if (ip.second-ip.first == 1) {
- out << name << " " << m_scores[ip.first] << endl;
+ out << name << sep << m_scores[ip.first] << linesep;
} else {
for (size_t i=ip.first; i < ip.second; ++i) {
ostringstream fullname;
fullname << name << "_" << (i + 1 - ip.first);
- out << fullname.str() << " " << m_scores[i] << endl;
+ out << fullname.str() << sep << m_scores[i] << linesep;
}
}
}
// write sparse features
- m_scores.write(out);
+ m_scores.write(out,sep,linesep);
}
void ScoreComponentCollection::Save(const string& filename) const
@@ -214,7 +220,7 @@ void ScoreComponentCollection::Save(const string& filename) const
void
ScoreComponentCollection::
-Assign(const FeatureFunction* sp, const string line)
+Assign(const FeatureFunction* sp, const string &line)
{
istringstream istr(line);
while(istr) {
diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h
index 09de115f9..802d00002 100644
--- a/moses/ScoreComponentCollection.h
+++ b/moses/ScoreComponentCollection.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
// $Id$
/***********************************************************************
@@ -93,10 +94,13 @@ class ScoreComponentCollection
private:
FVector m_scores;
+public:
typedef std::pair<size_t,size_t> IndexPair;
+private:
typedef std::map<const FeatureFunction*,IndexPair> ScoreIndexMap;
static ScoreIndexMap s_scoreIndexes;
static size_t s_denseVectorSize;
+public:
static IndexPair GetIndexes(const FeatureFunction* sp) {
ScoreIndexMap::const_iterator indexIter = s_scoreIndexes.find(sp);
if (indexIter == s_scoreIndexes.end()) {
@@ -257,12 +261,21 @@ public:
void PlusEquals(const FeatureFunction* sp, const ScorePair &scorePair);
+ // Add score by index
+ void PlusEquals(size_t index, float score) {
+ m_scores[index] += score;
+ }
+
//For features which have an unbounded number of components
void SparsePlusEquals(const std::string& full_name, float score) {
FName fname(full_name);
m_scores[fname] += score;
}
+ void SparsePlusEquals(const FName& fname, float score) {
+ m_scores[fname] += score;
+ }
+
void Assign(const FeatureFunction* sp, const std::vector<float>& scores);
//! Special version Assign(ScoreProducer, vector<float>)
@@ -275,7 +288,7 @@ public:
m_scores[indexes.first] = score;
}
- // Assign core weight by index
+ // Assign score by index
void Assign(size_t index, float score) {
m_scores[index] = score;
}
@@ -287,7 +300,7 @@ public:
//Read sparse features from string
- void Assign(const FeatureFunction* sp, const std::string line);
+ void Assign(const FeatureFunction* sp, const std::string &line);
// shortcut: setting the value directly using the feature name
void Assign(const std::string name, float score) {
@@ -346,6 +359,11 @@ public:
m_scores.capMin(minValue);
}
+ std::pair<size_t,size_t> GetIndexesForProducer(const FeatureFunction* sp) const {
+ IndexPair indexPair = GetIndexes(sp);
+ return indexPair;
+ }
+
//! if a FeatureFunction produces a single score (for example, a language model score)
//! this will return it. If not, this method will throw
float GetScoreForProducer(const FeatureFunction* sp) const {
@@ -375,7 +393,7 @@ public:
size_t SparseL1Regularize(float lambda);
void SparseL2Regularize(float lambda);
void Save(const std::string& filename) const;
- void Save(std::ostream&) const;
+ void Save(std::ostream&, bool multiline=true) const;
void IncrementSparseHopeFeatures() {
m_scores.incrementSparseHopeFeatures();
diff --git a/moses/ScoreComponentCollectionTest.cpp b/moses/ScoreComponentCollectionTest.cpp
index de542d1f6..a238d66b8 100644
--- a/moses/ScoreComponentCollectionTest.cpp
+++ b/moses/ScoreComponentCollectionTest.cpp
@@ -34,16 +34,16 @@ class MockStatelessFeatureFunction : public StatelessFeatureFunction
public:
MockStatelessFeatureFunction(size_t n, const string &line) :
StatelessFeatureFunction(n, line) {}
- void Evaluate(const Hypothesis&, ScoreComponentCollection*) const {}
- void EvaluateChart(const ChartHypothesis&, ScoreComponentCollection*) const {}
- void Evaluate(const InputType &input
+ void EvaluateWhenApplied(const Hypothesis&, ScoreComponentCollection*) const {}
+ void EvaluateWhenApplied(const ChartHypothesis&, ScoreComponentCollection*) const {}
+ void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore) const
{}
- void Evaluate(const Phrase &source
+ void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp
index eeb11d164..b8382eadd 100644
--- a/moses/SearchCubePruning.cpp
+++ b/moses/SearchCubePruning.cpp
@@ -86,7 +86,7 @@ void SearchCubePruning::ProcessSentence()
// go through each stack
size_t stackNo = 1;
std::vector < HypothesisStack* >::iterator iterStack;
- for (iterStack = ++m_hypoStackColl.begin() ; iterStack != m_hypoStackColl.end() ; ++iterStack) {
+ for (iterStack = m_hypoStackColl.begin() + 1 ; iterStack != m_hypoStackColl.end() ; ++iterStack) {
// check if decoding ran out of time
double _elapsed_time = GetUserTime();
if (_elapsed_time > staticData.GetTimeoutThreshold()) {
@@ -250,11 +250,6 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
return true;
}
- if (StaticData::Instance().AdjacentOnly() &&
- !hypoBitmap.IsAdjacent(range.GetStartPos(), range.GetEndPos())) {
- return false;
- }
-
bool leftMostEdge = (hypoFirstGapPos == startPos);
// any length extension is okay if starting at left-most edge
if (leftMostEdge) {
diff --git a/moses/SearchNormal.cpp b/moses/SearchNormal.cpp
index 0df179e13..80ff37703 100644
--- a/moses/SearchNormal.cpp
+++ b/moses/SearchNormal.cpp
@@ -93,7 +93,9 @@ void SearchNormal::ProcessSentence()
// this stack is fully expanded;
actual_hypoStack = &sourceHypoColl;
+
}
+ //OutputHypoStack();
}
@@ -253,11 +255,6 @@ void SearchNormal::ExpandAllHypotheses(const Hypothesis &hypothesis, size_t star
expectedScore += m_transOptColl.GetFutureScore().CalcFutureScore( hypothesis.GetWordsBitmap(), startPos, endPos );
}
- if (StaticData::Instance().AdjacentOnly() &&
- !hypothesis.GetWordsBitmap().IsAdjacent(startPos, endPos)) {
- return;
- }
-
// loop through all translation options
const TranslationOptionList &transOptList = m_transOptColl.GetTranslationOptionList(WordsRange(startPos, endPos));
TranslationOptionList::const_iterator iter;
@@ -291,7 +288,7 @@ void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis, const Translat
stats.StopTimeBuildHyp();
}
if (newHypo==NULL) return;
- newHypo->Evaluate(m_transOptColl.GetFutureScore());
+ newHypo->EvaluateWhenApplied(m_transOptColl.GetFutureScore());
} else
// early discarding: check if hypothesis is too bad to build
{
@@ -386,4 +383,15 @@ void SearchNormal::OutputHypoStackSize()
TRACE_ERR( endl);
}
+void SearchNormal::OutputHypoStack()
+{
+ // all stacks
+ int i = 0;
+ vector < HypothesisStack* >::iterator iterStack;
+ for (iterStack = m_hypoStackColl.begin() ; iterStack != m_hypoStackColl.end() ; ++iterStack) {
+ HypothesisStackNormal &hypoColl = *static_cast<HypothesisStackNormal*>(*iterStack);
+ TRACE_ERR( "Stack " << i++ << ": " << endl << hypoColl << endl);
+ }
+}
+
}
diff --git a/moses/SearchNormal.h b/moses/SearchNormal.h
index 49a0bae9d..d76e102c2 100644
--- a/moses/SearchNormal.h
+++ b/moses/SearchNormal.h
@@ -39,7 +39,7 @@ public:
void ProcessSentence();
void OutputHypoStackSize();
- void OutputHypoStack(int stack);
+ void OutputHypoStack();
virtual const std::vector < HypothesisStack* >& GetHypothesisStacks() const;
virtual const Hypothesis *GetBestHypothesis() const;
diff --git a/moses/SearchNormalBatch.cpp b/moses/SearchNormalBatch.cpp
index 055d85ed9..9f41f4c25 100644
--- a/moses/SearchNormalBatch.cpp
+++ b/moses/SearchNormalBatch.cpp
@@ -159,13 +159,13 @@ void SearchNormalBatch::EvalAndMergePartialHypos()
++sfff_iter) {
const StatefulFeatureFunction &ff = *(sfff_iter->second);
int state_idx = sfff_iter->first;
- hypo->EvaluateWith(ff, state_idx);
+ hypo->EvaluateWhenApplied(ff, state_idx);
}
std::vector<const StatelessFeatureFunction*>::iterator slff_iter;
for (slff_iter = m_stateless_ffs.begin();
slff_iter != m_stateless_ffs.end();
++slff_iter) {
- hypo->EvaluateWith(**slff_iter);
+ hypo->EvaluateWhenApplied(**slff_iter);
}
}
@@ -190,7 +190,7 @@ void SearchNormalBatch::EvalAndMergePartialHypos()
dlm_iter != m_dlm_ffs.end();
++dlm_iter) {
LanguageModel &lm = *(dlm_iter->second);
- hypo->EvaluateWith(lm, (*dlm_iter).first);
+ hypo->EvaluateWhenApplied(lm, (*dlm_iter).first);
}
// Put completed hypothesis onto its stack.
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index 2d19842da..b4b92581e 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -1,5 +1,5 @@
// $Id$
-// vim:tabstop=2
+// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
@@ -61,14 +61,12 @@ StaticData StaticData::s_instance;
StaticData::StaticData()
:m_sourceStartPosMattersForRecombination(false)
,m_inputType(SentenceInput)
- ,m_detailedTranslationReportingFilePath()
- ,m_detailedTreeFragmentsTranslationReportingFilePath()
,m_onlyDistinctNBest(false)
,m_needAlignmentInfo(false)
- ,m_factorDelimiter("|") // default delimiter between factors
,m_lmEnableOOVFeature(false)
,m_isAlwaysCreateDirectTranslationOption(false)
,m_currentWeightSetting("default")
+ ,m_useS2TDecoder(false)
,m_treeStructure(NULL)
{
m_xmlBrackets.first="<";
@@ -105,22 +103,20 @@ bool StaticData::LoadData(Parameter *parameter)
ResetUserTime();
m_parameter = parameter;
+ const PARAM_VEC *params;
+
// verbose level
- m_verboseLevel = 1;
- if (m_parameter->GetParam("verbose").size() == 1) {
- m_verboseLevel = Scan<size_t>( m_parameter->GetParam("verbose")[0]);
- }
+ m_parameter->SetParameter(m_verboseLevel, "verbose", (size_t) 1);
// to cube or not to cube
- m_searchAlgorithm = (m_parameter->GetParam("search-algorithm").size() > 0) ?
- (SearchAlgorithm) Scan<size_t>(m_parameter->GetParam("search-algorithm")[0]) : Normal;
+ m_parameter->SetParameter(m_searchAlgorithm, "search-algorithm", Normal);
if (IsChart())
LoadChartDecodingParameters();
// input type has to be specified BEFORE loading the phrase tables!
- if(m_parameter->GetParam("inputtype").size())
- m_inputType= (InputTypeEnum) Scan<int>(m_parameter->GetParam("inputtype")[0]);
+ m_parameter->SetParameter(m_inputType, "inputtype", SentenceInput);
+
std::string s_it = "text input";
if (m_inputType == 1) {
s_it = "confusion net";
@@ -128,97 +124,104 @@ bool StaticData::LoadData(Parameter *parameter)
if (m_inputType == 2) {
s_it = "word lattice";
}
+ if (m_inputType == 3) {
+ s_it = "tree";
+ }
VERBOSE(2,"input type is: "<<s_it<<"\n");
- if(m_parameter->GetParam("recover-input-path").size()) {
- m_recoverPath = Scan<bool>(m_parameter->GetParam("recover-input-path")[0]);
- if (m_recoverPath && m_inputType == SentenceInput) {
+ m_parameter->SetParameter(m_recoverPath, "recover-input-path", false);
+ if (m_recoverPath && m_inputType == SentenceInput) {
TRACE_ERR("--recover-input-path should only be used with confusion net or word lattice input!\n");
m_recoverPath = false;
}
- }
// factor delimiter
- if (m_parameter->GetParam("factor-delimiter").size() > 0) {
- m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
- if (m_factorDelimiter == "none")
+ m_parameter->SetParameter<string>(m_factorDelimiter, "factor-delimiter", "|");
+ if (m_factorDelimiter == "none") {
m_factorDelimiter = "";
}
- SetBooleanParameter( &m_continuePartialTranslation, "continue-partial-translation", false );
- SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
+ m_parameter->SetParameter( m_continuePartialTranslation, "continue-partial-translation", false );
+ m_parameter->SetParameter( m_outputHypoScore, "output-hypo-score", false );
//word-to-word alignment
// alignments
- SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
+ m_parameter->SetParameter( m_PrintAlignmentInfo, "print-alignment-info", false );
if (m_PrintAlignmentInfo) {
m_needAlignmentInfo = true;
}
- if(m_parameter->GetParam("sort-word-alignment").size()) {
- m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
- }
+ m_parameter->SetParameter(m_wordAlignmentSort, "sort-word-alignment", NoSort);
- SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
+ m_parameter->SetParameter( m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
if (m_PrintAlignmentInfoNbest) {
m_needAlignmentInfo = true;
}
- if (m_parameter->GetParam("alignment-output-file").size() > 0) {
- m_alignmentOutputFile = Scan<std::string>(m_parameter->GetParam("alignment-output-file")[0]);
+ params = m_parameter->GetParam("alignment-output-file");
+ if (params && params->size()) {
+ m_alignmentOutputFile = Scan<std::string>(params->at(0));
m_needAlignmentInfo = true;
}
- SetBooleanParameter( &m_PrintID, "print-id", false );
- SetBooleanParameter( &m_PrintPassthroughInformation, "print-passthrough", false );
- SetBooleanParameter( &m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false );
+ m_parameter->SetParameter( m_PrintID, "print-id", false );
+ m_parameter->SetParameter( m_PrintPassthroughInformation, "print-passthrough", false );
+ m_parameter->SetParameter( m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false );
// n-best
- if (m_parameter->GetParam("n-best-list").size() >= 2) {
- m_nBestFilePath = m_parameter->GetParam("n-best-list")[0];
- m_nBestSize = Scan<size_t>( m_parameter->GetParam("n-best-list")[1] );
- m_onlyDistinctNBest=(m_parameter->GetParam("n-best-list").size()>2
- && m_parameter->GetParam("n-best-list")[2]=="distinct");
- } else if (m_parameter->GetParam("n-best-list").size() == 1) {
- UserMessage::Add(string("wrong format for switch -n-best-list file size"));
- return false;
+ params = m_parameter->GetParam("n-best-list");
+ if (params) {
+ if (params->size() >= 2) {
+ m_nBestFilePath = params->at(0);
+ m_nBestSize = Scan<size_t>( params->at(1) );
+ m_onlyDistinctNBest=(params->size()>2 && params->at(2)=="distinct");
+ }
+ else {
+ UserMessage::Add(string("wrong format for switch -n-best-list file size [disinct]"));
+ return false;
+ }
} else {
m_nBestSize = 0;
}
- if (m_parameter->GetParam("n-best-factor").size() > 0) {
- m_nBestFactor = Scan<size_t>( m_parameter->GetParam("n-best-factor")[0]);
- } else {
- m_nBestFactor = 20;
- }
+
+ m_parameter->SetParameter<size_t>(m_nBestFactor, "n-best-factor", 20);
//lattice samples
- if (m_parameter->GetParam("lattice-samples").size() ==2 ) {
- m_latticeSamplesFilePath = m_parameter->GetParam("lattice-samples")[0];
- m_latticeSamplesSize = Scan<size_t>(m_parameter->GetParam("lattice-samples")[1]);
- } else if (m_parameter->GetParam("lattice-samples").size() != 0 ) {
- UserMessage::Add(string("wrong format for switch -lattice-samples file size"));
- return false;
- } else {
+ params = m_parameter->GetParam("lattice-samples");
+ if (params) {
+ if (params->size() ==2 ) {
+ m_latticeSamplesFilePath = params->at(0);
+ m_latticeSamplesSize = Scan<size_t>(params->at(1));
+ }
+ else {
+ UserMessage::Add(string("wrong format for switch -lattice-samples file size"));
+ return false;
+ }
+ }
+ else {
m_latticeSamplesSize = 0;
}
// word graph
- if (m_parameter->GetParam("output-word-graph").size() == 2)
- m_outputWordGraph = true;
+ params = m_parameter->GetParam("output-word-graph");
+ if (params && params->size() == 2)
+ m_outputWordGraph = true;
else
- m_outputWordGraph = false;
+ m_outputWordGraph = false;
// search graph
- if (m_parameter->GetParam("output-search-graph").size() > 0) {
- if (m_parameter->GetParam("output-search-graph").size() != 1) {
+ params = m_parameter->GetParam("output-search-graph");
+ if (params && params->size()) {
+ if (params->size() != 1) {
UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph file"));
return false;
}
m_outputSearchGraph = true;
}
// ... in extended format
- else if (m_parameter->GetParam("output-search-graph-extended").size() > 0) {
- if (m_parameter->GetParam("output-search-graph-extended").size() != 1) {
+ else if (m_parameter->GetParam("output-search-graph-extended") &&
+ m_parameter->GetParam("output-search-graph-extended")->size()) {
+ if (m_parameter->GetParam("output-search-graph-extended")->size() != 1) {
UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph-extended file"));
return false;
}
@@ -227,19 +230,25 @@ bool StaticData::LoadData(Parameter *parameter)
} else {
m_outputSearchGraph = false;
}
- if (m_parameter->GetParam("output-search-graph-slf").size() > 0) {
+
+ params = m_parameter->GetParam("output-search-graph-slf");
+ if (params && params->size()) {
m_outputSearchGraphSLF = true;
} else {
m_outputSearchGraphSLF = false;
}
- if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) {
+
+ params = m_parameter->GetParam("output-search-graph-hypergraph");
+ if (params && params->size()) {
m_outputSearchGraphHypergraph = true;
} else {
m_outputSearchGraphHypergraph = false;
}
+
#ifdef HAVE_PROTOBUF
- if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
- if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {
+ params = m_parameter->GetParam("output-search-graph-pb");
+ if (params && params->size()) {
+ if (params->size() != 1) {
UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph-pb path"));
return false;
}
@@ -247,49 +256,41 @@ bool StaticData::LoadData(Parameter *parameter)
} else
m_outputSearchGraphPB = false;
#endif
- SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false );
- SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );
- if (m_parameter->isParamSpecified("output-unknowns")) {
+ m_parameter->SetParameter( m_unprunedSearchGraph, "unpruned-search-graph", false );
+ m_parameter->SetParameter( m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );
- if (m_parameter->GetParam("output-unknowns").size() == 1) {
- m_outputUnknownsFile =Scan<string>(m_parameter->GetParam("output-unknowns")[0]);
- } else {
- UserMessage::Add(string("need to specify exactly one file name for unknowns"));
- return false;
- }
- }
+ m_parameter->SetParameter<string>(m_outputUnknownsFile, "output-unknowns", "");
// include feature names in the n-best list
- SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );
+ m_parameter->SetParameter( m_labeledNBestList, "labeled-n-best-list", true );
// include word alignment in the n-best list
- SetBooleanParameter( &m_nBestIncludesSegmentation, "include-segmentation-in-n-best", false );
+ m_parameter->SetParameter( m_nBestIncludesSegmentation, "include-segmentation-in-n-best", false );
// printing source phrase spans
- SetBooleanParameter( &m_reportSegmentation, "report-segmentation", false );
- SetBooleanParameter( &m_reportSegmentationEnriched, "report-segmentation-enriched", false );
+ m_parameter->SetParameter( m_reportSegmentation, "report-segmentation", false );
+ m_parameter->SetParameter( m_reportSegmentationEnriched, "report-segmentation-enriched", false );
// print all factors of output translations
- SetBooleanParameter( &m_reportAllFactors, "report-all-factors", false );
+ m_parameter->SetParameter( m_reportAllFactors, "report-all-factors", false );
// print all factors of output translations
- SetBooleanParameter( &m_reportAllFactorsNBest, "report-all-factors-in-n-best", false );
+ m_parameter->SetParameter( m_reportAllFactorsNBest, "report-all-factors-in-n-best", false );
//input factors
- const vector<string> &inputFactorVector = m_parameter->GetParam("input-factors");
- for(size_t i=0; i<inputFactorVector.size(); i++) {
- m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i]));
+ params = m_parameter->GetParam("input-factors");
+ if (params) {
+ m_inputFactorOrder = Scan<FactorType>(*params);
}
if(m_inputFactorOrder.empty()) {
- UserMessage::Add(string("no input factor specified in config file"));
- return false;
+ m_inputFactorOrder.push_back(0);
}
//output factors
- const vector<string> &outputFactorVector = m_parameter->GetParam("output-factors");
- for(size_t i=0; i<outputFactorVector.size(); i++) {
- m_outputFactorOrder.push_back(Scan<FactorType>(outputFactorVector[i]));
+ params = m_parameter->GetParam("output-factors");
+ if (params) {
+ m_outputFactorOrder = Scan<FactorType>(*params);
}
if(m_outputFactorOrder.empty()) {
// default. output factor 0
@@ -297,60 +298,35 @@ bool StaticData::LoadData(Parameter *parameter)
}
//source word deletion
- SetBooleanParameter( &m_wordDeletionEnabled, "phrase-drop-allowed", false );
+ m_parameter->SetParameter(m_wordDeletionEnabled, "phrase-drop-allowed", false );
//Disable discarding
- SetBooleanParameter(&m_disableDiscarding, "disable-discarding", false);
+ m_parameter->SetParameter(m_disableDiscarding, "disable-discarding", false);
//Print Translation Options
- SetBooleanParameter( &m_printTranslationOptions, "print-translation-option", false );
+ m_parameter->SetParameter(m_printTranslationOptions, "print-translation-option", false );
//Print All Derivations
- SetBooleanParameter( &m_printAllDerivations , "print-all-derivations", false );
+ m_parameter->SetParameter(m_printAllDerivations , "print-all-derivations", false );
// additional output
- if (m_parameter->isParamSpecified("translation-details")) {
- const vector<string> &args = m_parameter->GetParam("translation-details");
- if (args.size() == 1) {
- m_detailedTranslationReportingFilePath = args[0];
- } else {
- UserMessage::Add(string("the translation-details option requires exactly one filename argument"));
- return false;
- }
- }
- if (m_parameter->isParamSpecified("tree-translation-details")) {
- const vector<string> &args = m_parameter->GetParam("tree-translation-details");
- if (args.size() == 1) {
- m_detailedTreeFragmentsTranslationReportingFilePath = args[0];
- } else {
- UserMessage::Add(string("the tree-translation-details option requires exactly one filename argument"));
- return false;
- }
- }
+ m_parameter->SetParameter<string>(m_detailedTranslationReportingFilePath, "translation-details", "");
+ m_parameter->SetParameter<string>(m_detailedTreeFragmentsTranslationReportingFilePath, "tree-translation-details", "");
//DIMw
- if (m_parameter->isParamSpecified("translation-all-details")) {
- const vector<string> &args = m_parameter->GetParam("translation-all-details");
- if (args.size() == 1) {
- m_detailedAllTranslationReportingFilePath = args[0];
- } else {
- UserMessage::Add(string("the translation-all-details option requires exactly one filename argument"));
- return false;
- }
- }
+ m_parameter->SetParameter<string>(m_detailedAllTranslationReportingFilePath, "translation-all-details", "");
// reordering constraints
- m_maxDistortion = (m_parameter->GetParam("distortion-limit").size() > 0) ?
- Scan<int>(m_parameter->GetParam("distortion-limit")[0])
- : -1;
- SetBooleanParameter( &m_reorderingConstraint, "monotone-at-punctuation", false );
+ m_parameter->SetParameter(m_maxDistortion, "distortion-limit", -1);
+
+ m_parameter->SetParameter(m_reorderingConstraint, "monotone-at-punctuation", false );
// settings for pruning
- m_maxHypoStackSize = (m_parameter->GetParam("stack").size() > 0)
- ? Scan<size_t>(m_parameter->GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE;
+ m_parameter->SetParameter(m_maxHypoStackSize, "stack", DEFAULT_MAX_HYPOSTACK_SIZE);
m_minHypoStackDiversity = 0;
- if (m_parameter->GetParam("stack-diversity").size() > 0) {
+ params = m_parameter->GetParam("stack-diversity");
+ if (params && params->size()) {
if (m_maxDistortion > 15) {
UserMessage::Add("stack diversity > 0 is not allowed for distortion limits larger than 15");
return false;
@@ -359,101 +335,92 @@ bool StaticData::LoadData(Parameter *parameter)
UserMessage::Add("stack diversity > 0 is not allowed for lattice input");
return false;
}
- m_minHypoStackDiversity = Scan<size_t>(m_parameter->GetParam("stack-diversity")[0]);
+ m_minHypoStackDiversity = Scan<size_t>(params->at(0));
}
- m_beamWidth = (m_parameter->GetParam("beam-threshold").size() > 0) ?
- TransformScore(Scan<float>(m_parameter->GetParam("beam-threshold")[0]))
- : TransformScore(DEFAULT_BEAM_WIDTH);
- m_earlyDiscardingThreshold = (m_parameter->GetParam("early-discarding-threshold").size() > 0) ?
- TransformScore(Scan<float>(m_parameter->GetParam("early-discarding-threshold")[0]))
- : TransformScore(DEFAULT_EARLY_DISCARDING_THRESHOLD);
- m_translationOptionThreshold = (m_parameter->GetParam("translation-option-threshold").size() > 0) ?
- TransformScore(Scan<float>(m_parameter->GetParam("translation-option-threshold")[0]))
- : TransformScore(DEFAULT_TRANSLATION_OPTION_THRESHOLD);
-
- m_maxNoTransOptPerCoverage = (m_parameter->GetParam("max-trans-opt-per-coverage").size() > 0)
- ? Scan<size_t>(m_parameter->GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
+ m_parameter->SetParameter(m_beamWidth, "beam-threshold", DEFAULT_BEAM_WIDTH);
+ m_beamWidth = TransformScore(m_beamWidth);
- m_maxNoPartTransOpt = (m_parameter->GetParam("max-partial-trans-opt").size() > 0)
- ? Scan<size_t>(m_parameter->GetParam("max-partial-trans-opt")[0]) : DEFAULT_MAX_PART_TRANS_OPT_SIZE;
+ m_parameter->SetParameter(m_earlyDiscardingThreshold, "early-discarding-threshold", DEFAULT_EARLY_DISCARDING_THRESHOLD);
+ m_earlyDiscardingThreshold = TransformScore(m_earlyDiscardingThreshold);
- m_maxPhraseLength = (m_parameter->GetParam("max-phrase-length").size() > 0)
- ? Scan<size_t>(m_parameter->GetParam("max-phrase-length")[0]) : DEFAULT_MAX_PHRASE_LENGTH;
+ m_parameter->SetParameter(m_translationOptionThreshold, "translation-option-threshold", DEFAULT_TRANSLATION_OPTION_THRESHOLD);
+ m_translationOptionThreshold = TransformScore(m_translationOptionThreshold);
- m_cubePruningPopLimit = (m_parameter->GetParam("cube-pruning-pop-limit").size() > 0)
- ? Scan<size_t>(m_parameter->GetParam("cube-pruning-pop-limit")[0]) : DEFAULT_CUBE_PRUNING_POP_LIMIT;
+ m_parameter->SetParameter(m_maxNoTransOptPerCoverage, "max-trans-opt-per-coverage", DEFAULT_MAX_TRANS_OPT_SIZE);
+ m_parameter->SetParameter(m_maxNoPartTransOpt, "max-partial-trans-opt", DEFAULT_MAX_PART_TRANS_OPT_SIZE);
+ m_parameter->SetParameter(m_maxPhraseLength, "max-phrase-length", DEFAULT_MAX_PHRASE_LENGTH);
+ m_parameter->SetParameter(m_cubePruningPopLimit, "cube-pruning-pop-limit", DEFAULT_CUBE_PRUNING_POP_LIMIT);
+ m_parameter->SetParameter(m_cubePruningDiversity, "cube-pruning-diversity", DEFAULT_CUBE_PRUNING_DIVERSITY);
- m_cubePruningDiversity = (m_parameter->GetParam("cube-pruning-diversity").size() > 0)
- ? Scan<size_t>(m_parameter->GetParam("cube-pruning-diversity")[0]) : DEFAULT_CUBE_PRUNING_DIVERSITY;
-
- SetBooleanParameter(&m_cubePruningLazyScoring, "cube-pruning-lazy-scoring", false);
+ m_parameter->SetParameter(m_cubePruningLazyScoring, "cube-pruning-lazy-scoring", false);
// early distortion cost
- SetBooleanParameter( &m_useEarlyDistortionCost, "early-distortion-cost", false );
+ m_parameter->SetParameter(m_useEarlyDistortionCost, "early-distortion-cost", false );
// unknown word processing
- SetBooleanParameter( &m_dropUnknown, "drop-unknown", false );
- SetBooleanParameter( &m_markUnknown, "mark-unknown", false );
-
- SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false);
+ m_parameter->SetParameter(m_dropUnknown, "drop-unknown", false );
+ m_parameter->SetParameter(m_markUnknown, "mark-unknown", false );
- SetBooleanParameter( &m_adjacentOnly, "adjacent-only", false);
+ m_parameter->SetParameter(m_lmEnableOOVFeature, "lmodel-oov-feature", false);
// minimum Bayes risk decoding
- SetBooleanParameter( &m_mbr, "minimum-bayes-risk", false );
- m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ?
- Scan<size_t>(m_parameter->GetParam("mbr-size")[0]) : 200;
- m_mbrScale = (m_parameter->GetParam("mbr-scale").size() > 0) ?
- Scan<float>(m_parameter->GetParam("mbr-scale")[0]) : 1.0f;
+ m_parameter->SetParameter(m_mbr, "minimum-bayes-risk", false );
+ m_parameter->SetParameter<size_t>(m_mbrSize, "mbr-size", 200);
+ m_parameter->SetParameter(m_mbrScale, "mbr-scale", 1.0f);
//lattice mbr
- SetBooleanParameter( &m_useLatticeMBR, "lminimum-bayes-risk", false );
+ m_parameter->SetParameter(m_useLatticeMBR, "lminimum-bayes-risk", false );
if (m_useLatticeMBR && m_mbr) {
cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl;
exit(1);
}
//mira training
- SetBooleanParameter( &m_mira, "mira", false );
+ m_parameter->SetParameter(m_mira, "mira", false );
// lattice MBR
if (m_useLatticeMBR) m_mbr = true;
- m_lmbrPruning = (m_parameter->GetParam("lmbr-pruning-factor").size() > 0) ?
- Scan<size_t>(m_parameter->GetParam("lmbr-pruning-factor")[0]) : 30;
- m_lmbrThetas = Scan<float>(m_parameter->GetParam("lmbr-thetas"));
- SetBooleanParameter( &m_useLatticeHypSetForLatticeMBR, "lattice-hypo-set", false );
- m_lmbrPrecision = (m_parameter->GetParam("lmbr-p").size() > 0) ?
- Scan<float>(m_parameter->GetParam("lmbr-p")[0]) : 0.8f;
- m_lmbrPRatio = (m_parameter->GetParam("lmbr-r").size() > 0) ?
- Scan<float>(m_parameter->GetParam("lmbr-r")[0]) : 0.6f;
- m_lmbrMapWeight = (m_parameter->GetParam("lmbr-map-weight").size() >0) ?
- Scan<float>(m_parameter->GetParam("lmbr-map-weight")[0]) : 0.0f;
+ m_parameter->SetParameter<size_t>(m_lmbrPruning, "lmbr-pruning-factor", 30);
+ m_parameter->SetParameter(m_lmbrPrecision, "lmbr-p", 0.8f);
+ m_parameter->SetParameter(m_lmbrPRatio, "lmbr-r", 0.6f);
+ m_parameter->SetParameter(m_lmbrMapWeight, "lmbr-map-weight", 0.0f);
+ m_parameter->SetParameter(m_useLatticeHypSetForLatticeMBR, "lattice-hypo-set", false );
+
+ params = m_parameter->GetParam("lmbr-thetas");
+ if (params) {
+ m_lmbrThetas = Scan<float>(*params);
+ }
//consensus decoding
- SetBooleanParameter( &m_useConsensusDecoding, "consensus-decoding", false );
+ m_parameter->SetParameter(m_useConsensusDecoding, "consensus-decoding", false );
if (m_useConsensusDecoding && m_mbr) {
cerr<< "Error: Cannot use consensus decoding together with mbr" << endl;
exit(1);
}
if (m_useConsensusDecoding) m_mbr=true;
+ m_parameter->SetParameter(m_defaultNonTermOnlyForEmptyRange, "default-non-term-for-empty-range-only", false );
+ m_parameter->SetParameter(m_printNBestTrees, "n-best-trees", false );
+
+ // S2T decoder
+ m_parameter->SetParameter(m_useS2TDecoder, "s2t", false );
+ m_parameter->SetParameter(m_s2tParsingAlgorithm, "s2t-parsing-algorithm", RecursiveCYKPlus);
+
// Compact phrase table and reordering model
- SetBooleanParameter( &m_minphrMemory, "minphr-memory", false );
- SetBooleanParameter( &m_minlexrMemory, "minlexr-memory", false );
+ m_parameter->SetParameter(m_minphrMemory, "minphr-memory", false );
+ m_parameter->SetParameter(m_minlexrMemory, "minlexr-memory", false );
- m_timeout_threshold = (m_parameter->GetParam("time-out").size() > 0) ?
- Scan<size_t>(m_parameter->GetParam("time-out")[0]) : -1;
+ m_parameter->SetParameter<size_t>(m_timeout_threshold, "time-out", -1);
m_timeout = (GetTimeoutThreshold() == (size_t)-1) ? false : true;
- m_lmcache_cleanup_threshold = (m_parameter->GetParam("clean-lm-cache").size() > 0) ?
- Scan<size_t>(m_parameter->GetParam("clean-lm-cache")[0]) : 1;
+ m_parameter->SetParameter<size_t>(m_lmcache_cleanup_threshold, "clean-lm-cache", 1);
m_threadCount = 1;
- const std::vector<std::string> &threadInfo = m_parameter->GetParam("threads");
- if (!threadInfo.empty()) {
- if (threadInfo[0] == "all") {
+ params = m_parameter->GetParam("threads");
+ if (params && params->size()) {
+ if (params->at(0) == "all") {
#ifdef WITH_THREADS
m_threadCount = boost::thread::hardware_concurrency();
if (!m_threadCount) {
@@ -465,62 +432,50 @@ bool StaticData::LoadData(Parameter *parameter)
return false;
#endif
} else {
- m_threadCount = Scan<int>(threadInfo[0]);
+ m_threadCount = Scan<int>(params->at(0));
if (m_threadCount < 1) {
UserMessage::Add("Specify at least one thread.");
return false;
}
#ifndef WITH_THREADS
if (m_threadCount > 1) {
- UserMessage::Add(std::string("Error: Thread count of ") + threadInfo[0] + " but moses not built with thread support");
+ UserMessage::Add(std::string("Error: Thread count of ") + params->at(0) + " but moses not built with thread support");
return false;
}
#endif
}
}
- m_startTranslationId = (m_parameter->GetParam("start-translation-id").size() > 0) ?
- Scan<long>(m_parameter->GetParam("start-translation-id")[0]) : 0;
+ m_parameter->SetParameter<long>(m_startTranslationId, "start-translation-id", 0);
// use of xml in input
- if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough;
- else if (m_parameter->GetParam("xml-input")[0]=="exclusive") m_xmlInputType = XmlExclusive;
- else if (m_parameter->GetParam("xml-input")[0]=="inclusive") m_xmlInputType = XmlInclusive;
- else if (m_parameter->GetParam("xml-input")[0]=="constraint") m_xmlInputType = XmlConstraint;
- else if (m_parameter->GetParam("xml-input")[0]=="ignore") m_xmlInputType = XmlIgnore;
- else if (m_parameter->GetParam("xml-input")[0]=="pass-through") m_xmlInputType = XmlPassThrough;
- else {
- UserMessage::Add("invalid xml-input value, must be pass-through, exclusive, inclusive, constraint, or ignore");
- return false;
- }
+ m_parameter->SetParameter<XmlInputType>(m_xmlInputType, "xml-input", XmlPassThrough);
// specify XML tags opening and closing brackets for XML option
- if (m_parameter->GetParam("xml-brackets").size() > 0) {
- std::vector<std::string> brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]);
+ params = m_parameter->GetParam("xml-brackets");
+ if (params && params->size()) {
+ std::vector<std::string> brackets = Tokenize(params->at(0));
if(brackets.size()!=2) {
cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
exit(1);
}
m_xmlBrackets.first= brackets[0];
m_xmlBrackets.second=brackets[1];
- cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
+ VERBOSE(1,"XML tags opening and closing brackets for XML input are: "
+ << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl);
}
- if (m_parameter->GetParam("placeholder-factor").size() > 0) {
- m_placeHolderFactor = Scan<FactorType>(m_parameter->GetParam("placeholder-factor")[0]);
- } else {
- m_placeHolderFactor = NOT_FOUND;
- }
+ m_parameter->SetParameter(m_placeHolderFactor, "placeholder-factor", NOT_FOUND);
std::map<std::string, std::string> featureNameOverride = OverrideFeatureNames();
// all features
map<string, int> featureIndexMap;
- const vector<string> &features = m_parameter->GetParam("feature");
- for (size_t i = 0; i < features.size(); ++i) {
- const string &line = Trim(features[i]);
- cerr << "line=" << line << endl;
+ params = m_parameter->GetParam("feature");
+ for (size_t i = 0; params && i < params->size(); ++i) {
+ const string &line = Trim(params->at(i));
+ VERBOSE(1,"line=" << line << endl);
if (line.empty())
continue;
@@ -543,21 +498,9 @@ bool StaticData::LoadData(Parameter *parameter)
NoCache();
OverrideFeatures();
- std::cerr <<"After StaticData::LoadDataStatic" << std::endl;
-
- /*
- std::cerr <<"Before ShowWeights" << std::endl;
- // setting "-show-weights" -> just dump out weights and exit
- if (m_parameter->isParamSpecified("show-weights")) {
- MosesCmd::ShowWeights();
- exit(0);
- }
- std::cerr <<"After ShowWeights" << std::endl;
- */
-
- std::cerr <<"Before LoadFeatureFunctions" << std::endl;
- LoadFeatureFunctions();
- std::cerr <<"After LoadFeatureFunctions" << std::endl;
+ if (m_parameter->GetParam("show-weights") == NULL) {
+ LoadFeatureFunctions();
+ }
if (!LoadDecodeGraphs()) return false;
@@ -569,15 +512,12 @@ bool StaticData::LoadData(Parameter *parameter)
//Add any other features here.
//Load extra feature weights
- vector<string> extraWeightConfig = m_parameter->GetParam("weight-file");
- if (extraWeightConfig.size()) {
- if (extraWeightConfig.size() != 1) {
- UserMessage::Add("One argument should be supplied for weight-file");
- return false;
- }
+ string weightFile;
+ m_parameter->SetParameter<string>(weightFile, "weight-file", "");
+ if (!weightFile.empty()) {
ScoreComponentCollection extraWeights;
- if (!extraWeights.Load(extraWeightConfig[0])) {
- UserMessage::Add("Unable to load weights from " + extraWeightConfig[0]);
+ if (!extraWeights.Load(weightFile)) {
+ UserMessage::Add("Unable to load weights from " + weightFile);
return false;
}
m_allWeights.PlusEquals(extraWeights);
@@ -587,7 +527,8 @@ bool StaticData::LoadData(Parameter *parameter)
LoadSparseWeightsFromConfig();
// alternate weight settings
- if (m_parameter->GetParam("alternate-weight-setting").size() > 0) {
+ params = m_parameter->GetParam("alternate-weight-setting");
+ if (params && params->size()) {
if (!LoadAlternateWeightSettings()) {
return false;
}
@@ -595,25 +536,6 @@ bool StaticData::LoadData(Parameter *parameter)
return true;
}
-void StaticData::SetBooleanParameter( bool *parameter, string parameterName, bool defaultValue )
-{
- // default value if nothing is specified
- *parameter = defaultValue;
- if (! m_parameter->isParamSpecified( parameterName ) ) {
- return;
- }
-
- // if parameter is just specified as, e.g. "-parameter" set it true
- if (m_parameter->GetParam( parameterName ).size() == 0) {
- *parameter = true;
- }
-
- // if paramter is specified "-parameter true" or "-parameter false"
- else if (m_parameter->GetParam( parameterName ).size() == 1) {
- *parameter = Scan<bool>( m_parameter->GetParam( parameterName )[0]);
- }
-}
-
void StaticData::SetWeight(const FeatureFunction* sp, float weight)
{
m_allWeights.Resize();
@@ -629,13 +551,7 @@ void StaticData::SetWeights(const FeatureFunction* sp, const std::vector<float>&
void StaticData::LoadNonTerminals()
{
string defaultNonTerminals;
-
- if (m_parameter->GetParam("non-terminals").size() == 0) {
- defaultNonTerminals = "X";
- } else {
- vector<std::string> tokens = Tokenize(m_parameter->GetParam("non-terminals")[0]);
- defaultNonTerminals = tokens[0];
- }
+ m_parameter->SetParameter<string>(defaultNonTerminals, "non-terminals", "X");
FactorCollection &factorCollection = FactorCollection::Instance();
@@ -647,12 +563,13 @@ void StaticData::LoadNonTerminals()
const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals, true);
m_outputDefaultNonTerminal.SetFactor(0, targetFactor);
- // for unknwon words
- if (m_parameter->GetParam("unknown-lhs").size() == 0) {
+ // for unknown words
+ const PARAM_VEC *params = m_parameter->GetParam("unknown-lhs");
+ if (params == NULL || params->size() == 0) {
UnknownLHSEntry entry(defaultNonTerminals, 0.0f);
m_unknownLHS.push_back(entry);
} else {
- const string &filePath = m_parameter->GetParam("unknown-lhs")[0];
+ const string &filePath = params->at(0);
InputFileStream inStream(filePath);
string line;
@@ -662,7 +579,8 @@ void StaticData::LoadNonTerminals()
"Incorrect unknown LHS format: " << line);
UnknownLHSEntry entry(tokens[0], Scan<float>(tokens[1]));
m_unknownLHS.push_back(entry);
- const Factor *targetFactor = factorCollection.AddFactor(Output, 0, tokens[0], true);
+ // const Factor *targetFactor =
+ factorCollection.AddFactor(Output, 0, tokens[0], true);
}
}
@@ -674,20 +592,28 @@ void StaticData::LoadChartDecodingParameters()
LoadNonTerminals();
// source label overlap
- if (m_parameter->GetParam("source-label-overlap").size() > 0) {
- m_sourceLabelOverlap = (SourceLabelOverlap) Scan<int>(m_parameter->GetParam("source-label-overlap")[0]);
- } else {
- m_sourceLabelOverlap = SourceLabelOverlapAdd;
- }
+ m_parameter->SetParameter(m_sourceLabelOverlap, "source-label-overlap", SourceLabelOverlapAdd);
+ m_parameter->SetParameter(m_ruleLimit, "rule-limit", DEFAULT_MAX_TRANS_OPT_SIZE);
- m_ruleLimit = (m_parameter->GetParam("rule-limit").size() > 0)
- ? Scan<size_t>(m_parameter->GetParam("rule-limit")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
}
bool StaticData::LoadDecodeGraphs()
{
- const vector<string> &mappingVector = m_parameter->GetParam("mapping");
- const vector<size_t> &maxChartSpans = Scan<size_t>(m_parameter->GetParam("max-chart-span"));
+ vector<string> mappingVector;
+ vector<size_t> maxChartSpans;
+
+ const PARAM_VEC *params;
+
+ params = m_parameter->GetParam("mapping");
+ if (params && params->size()) {
+ mappingVector = *params;
+ }
+
+ params = m_parameter->GetParam("max-chart-span");
+ if (params && params->size()) {
+ maxChartSpans = Scan<size_t>(*params);
+ }
+
const vector<PhraseDictionary*>& pts = PhraseDictionary::GetColl();
const vector<GenerationDictionary*>& gens = GenerationDictionary::GetColl();
@@ -756,7 +682,7 @@ bool StaticData::LoadDecodeGraphs()
DecodeGraph *decodeGraph;
if (IsChart()) {
size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
- cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl;
+ VERBOSE(1,"max-chart-span: " << maxChartSpans[decodeGraphInd] << endl);
decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
} else {
decodeGraph = new DecodeGraph(m_decodeGraphs.size());
@@ -773,13 +699,13 @@ bool StaticData::LoadDecodeGraphs()
// set maximum n-gram size for backoff approach to decoding paths
// default is always use subsequent paths (value = 0)
// if specified, record maxmimum unseen n-gram size
- const vector<string> &backoffVector = m_parameter->GetParam("decoding-graph-backoff");
- for(size_t i=0; i<m_decodeGraphs.size() && i<backoffVector.size(); i++) {
- DecodeGraph &decodeGraph = *m_decodeGraphs[i];
+ const vector<string> *backoffVector = m_parameter->GetParam("decoding-graph-backoff");
+ for(size_t i=0; i<m_decodeGraphs.size() && backoffVector && i<backoffVector->size(); i++) {
+ DecodeGraph &decodeGraph = *m_decodeGraphs[i];
- if (i < backoffVector.size()) {
- decodeGraph.SetBackoff(Scan<size_t>(backoffVector[i]));
- }
+ if (i < backoffVector->size()) {
+ decodeGraph.SetBackoff(Scan<size_t>(backoffVector->at(i)));
+ }
}
return true;
@@ -888,7 +814,7 @@ void StaticData::SetExecPath(const std::string &path)
if (pos != string::npos) {
m_binPath = path.substr(0, pos);
}
- cerr << m_binPath << endl;
+ VERBOSE(1,m_binPath << endl);
}
const string &StaticData::GetBinDirectory() const
@@ -941,7 +867,8 @@ void StaticData::LoadFeatureFunctions()
FeatureFunction *ff = *iter;
bool doLoad = true;
- if (PhraseDictionary *ffCast = dynamic_cast<PhraseDictionary*>(ff)) {
+ // if (PhraseDictionary *ffCast = dynamic_cast<PhraseDictionary*>(ff)) {
+ if (dynamic_cast<PhraseDictionary*>(ff)) {
doLoad = false;
}
@@ -985,7 +912,7 @@ bool StaticData::CheckWeights() const
set<string>::iterator iter;
for (iter = weightNames.begin(); iter != weightNames.end(); ) {
string fname = (*iter).substr(0, (*iter).find("_"));
- cerr << fname << "\n";
+ VERBOSE(1,fname << "\n");
if (featureNames.find(fname) != featureNames.end()) {
weightNames.erase(iter++);
} else {
@@ -1038,7 +965,11 @@ bool StaticData::LoadAlternateWeightSettings()
return false;
}
- const vector<string> &weightSpecification = m_parameter->GetParam("alternate-weight-setting");
+ vector<string> weightSpecification;
+ const PARAM_VEC *params = m_parameter->GetParam("alternate-weight-setting");
+ if (params && params->size()) {
+ weightSpecification = *params;
+ }
// get mapping from feature names to feature functions
map<string,FeatureFunction*> nameToFF;
@@ -1060,7 +991,7 @@ bool StaticData::LoadAlternateWeightSettings()
vector<string> tokens = Tokenize(weightSpecification[i]);
vector<string> args = Tokenize(tokens[0], "=");
currentId = args[1];
- cerr << "alternate weight setting " << currentId << endl;
+ VERBOSE(1,"alternate weight setting " << currentId << endl);
UTIL_THROW_IF2(m_weightSetting.find(currentId) != m_weightSetting.end(),
"Duplicate alternate weight id: " << currentId);
m_weightSetting[ currentId ] = new ScoreComponentCollection;
@@ -1137,7 +1068,7 @@ bool StaticData::LoadAlternateWeightSettings()
void StaticData::NoCache()
{
bool noCache;
- SetBooleanParameter( &noCache, "no-cache", false );
+ m_parameter->SetParameter(noCache, "no-cache", false );
if (noCache) {
const std::vector<PhraseDictionary*> &pts = PhraseDictionary::GetColl();
@@ -1152,10 +1083,10 @@ std::map<std::string, std::string> StaticData::OverrideFeatureNames()
{
std::map<std::string, std::string> ret;
- const PARAM_VEC &params = m_parameter->GetParam("feature-name-overwrite");
- if (params.size()) {
- UTIL_THROW_IF2(params.size() != 1, "Only provide 1 line in the section [feature-name-overwrite]");
- vector<string> toks = Tokenize(params[0]);
+ const PARAM_VEC *params = m_parameter->GetParam("feature-name-overwrite");
+ if (params && params->size()) {
+ UTIL_THROW_IF2(params->size() != 1, "Only provide 1 line in the section [feature-name-overwrite]");
+ vector<string> toks = Tokenize(params->at(0));
UTIL_THROW_IF2(toks.size() % 2 != 0, "Format of -feature-name-overwrite must be [old-name new-name]*");
for (size_t i = 0; i < toks.size(); i += 2) {
@@ -1165,14 +1096,23 @@ std::map<std::string, std::string> StaticData::OverrideFeatureNames()
}
}
+ if (m_useS2TDecoder) {
+ // Automatically override PhraseDictionary{Memory,Scope3}. This will
+ // have to change if the FF parameters diverge too much in the future,
+ // but for now it makes switching between the old and new decoders much
+ // more convenient.
+ ret["PhraseDictionaryMemory"] = "RuleTable";
+ ret["PhraseDictionaryScope3"] = "RuleTable";
+ }
+
return ret;
}
void StaticData::OverrideFeatures()
{
- const PARAM_VEC &params = m_parameter->GetParam("feature-overwrite");
- for (size_t i = 0; i < params.size(); ++i) {
- const string &str = params[i];
+ const PARAM_VEC *params = m_parameter->GetParam("feature-overwrite");
+ for (size_t i = 0; params && i < params->size(); ++i) {
+ const string &str = params->at(i);
vector<string> toks = Tokenize(str);
UTIL_THROW_IF2(toks.size() <= 1, "Incorrect format for feature override: " << str);
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 96b1972e2..8d399bfb2 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -206,7 +206,10 @@ protected:
FactorType m_placeHolderFactor;
bool m_useLegacyPT;
- bool m_adjacentOnly;
+ bool m_defaultNonTermOnlyForEmptyRange;
+ bool m_useS2TDecoder;
+ S2TParsingAlgorithm m_s2tParsingAlgorithm;
+ bool m_printNBestTrees;
FeatureRegistry m_registry;
PhrasePropertyFactory m_phrasePropertyFactory;
@@ -216,9 +219,6 @@ protected:
void LoadChartDecodingParameters();
void LoadNonTerminals();
- //! helper fn to set bool param from ini file/command line
- void SetBooleanParameter(bool *paramter, std::string parameterName, bool defaultValue);
-
//! load decoding steps
bool LoadDecodeGraphs();
@@ -269,8 +269,8 @@ public:
bool LoadData(Parameter *parameter);
void ClearData();
- const PARAM_VEC &GetParam(const std::string &paramName) const {
- return m_parameter->GetParam(paramName);
+ const Parameter &GetParameter() const {
+ return *m_parameter;
}
const std::vector<FactorType> &GetInputFactorOrder() const {
@@ -399,10 +399,6 @@ public:
return m_minlexrMemory;
}
- const std::vector<std::string> &GetDescription() const {
- return m_parameter->GetParam("description");
- }
-
// for mert
size_t GetNBestSize() const {
return m_nBestSize;
@@ -465,10 +461,6 @@ public:
return m_allWeights.GetScoresForProducer(sp);
}
- float GetSparseWeight(const FName& featureName) const {
- return m_allWeights.GetSparseWeight(featureName);
- }
-
//Weights for feature with fixed number of values
void SetWeights(const FeatureFunction* sp, const std::vector<float>& weights);
@@ -778,15 +770,8 @@ public:
return m_softMatchesMap;
}
-
- bool AdjacentOnly() const {
- return m_adjacentOnly;
- }
-
-
void ResetWeights(const std::string &denseWeights, const std::string &sparseFile);
-
// need global access for output of tree structure
const StatefulFeatureFunction* GetTreeStructure() const {
return m_treeStructure;
@@ -796,6 +781,20 @@ public:
m_treeStructure = treeStructure;
}
+ bool GetDefaultNonTermOnlyForEmptyRange() const
+ { return m_defaultNonTermOnlyForEmptyRange; }
+
+ bool UseS2TDecoder() const {
+ return m_useS2TDecoder;
+ }
+ S2TParsingAlgorithm GetS2TParsingAlgorithm() const {
+ return m_s2tParsingAlgorithm;
+ }
+
+ bool PrintNBestTrees() const {
+ return m_printNBestTrees;
+ }
+
};
}
diff --git a/moses/SyntacticLanguageModel.h b/moses/SyntacticLanguageModel.h
index 6e88d85c1..76882a4d1 100644
--- a/moses/SyntacticLanguageModel.h
+++ b/moses/SyntacticLanguageModel.h
@@ -30,7 +30,7 @@ public:
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
+ FFState* EvaluateWhenApplied(const ChartHypothesis& cur_hypo,
int featureID,
ScoreComponentCollection* accumulator) const {
throw std::runtime_error("Syntactic LM can only be used with phrase-based decoder.");
diff --git a/moses/Syntax/BoundedPriorityContainer.h b/moses/Syntax/BoundedPriorityContainer.h
new file mode 100644
index 000000000..9afc1b75d
--- /dev/null
+++ b/moses/Syntax/BoundedPriorityContainer.h
@@ -0,0 +1,164 @@
+#pragma once
+
+#include <queue>
+#include <vector>
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// A container that can hold up to k objects of type T, each with an associated
+// priority. The container accepts new elements unconditionally until the
+// limit is reached. After that, elements are only accepted if they have a
+// higher priority than the worst element (which they displace).
+//
+// BoundedPriorityContainer does not preserve the insertion order of the
+// elements (or provide any other guarantees about order).
+//
+// BoundedPriorityContainer pre-allocates space for all k objects.
+//
+// (Although BoundedPriorityContainer is implemented using a priority queue,
+// it doesn't provide the interface of a priority queue, hence the generic
+// name 'container'.)
+template<typename T>
+class BoundedPriorityContainer
+{
+ public:
+ typedef typename std::vector<T>::iterator Iterator;
+ typedef typename std::vector<T>::const_iterator ConstIterator;
+
+ BoundedPriorityContainer(std::size_t);
+
+ Iterator Begin() { return m_elements.begin(); }
+ Iterator End() { return m_elements.begin()+m_size; }
+
+ ConstIterator Begin() const { return m_elements.begin(); }
+ ConstIterator End() const { return m_elements.begin()+m_size; }
+
+ // Return the number of elements currently held.
+ std::size_t Size() const { return m_size; }
+
+ // 'Lazily' clear the container by setting the size to 0 (allowing elements
+ // to be overwritten).
+ // TODO Eliminate heap-reorganisation overhead by using a vector-based heap
+ // TODO directly instead of priority_queue, which requires pop() to clear
+ // TODO Alternative, is to clear m_queue by assigning an empty queue value
+ // TODO but that might incur an alloc-related overhead when the new underlying
+ // TODO has to be regrown.
+ void LazyClear() { m_size = 0; while (!m_queue.empty()) { m_queue.pop(); } }
+
+ // Insert the given object iff
+ // i) the container is not full yet, or
+ // ii) the new object has a higher priority than the worst one already
+ // stored.
+ // The return value specifies whether or not the element was inserted.
+ bool Insert(const T &, float);
+
+ // Insert the given object iff
+ // i) the container is not full yet, or
+ // ii) the new object has a higher priority than the worst one already
+ // stored.
+ // If the element is inserted then, for efficiency reasons, it is swapped in
+ // rather than copied. This requires that T provides a swap() function. The
+ // return value specifies whether or not the element was inserted.
+ // TODO Test if this is actually any faster than Insert() in practice.
+ bool SwapIn(T &, float);
+
+ // Determine if an object with the given priority would be accepted for
+ // insertion based on the current contents of the container.
+ bool WouldAccept(float priority)
+ {
+ return m_size < m_limit || priority > m_queue.top().first;
+ }
+
+ private:
+ typedef std::pair<float, int> PriorityIndexPair;
+
+ class PriorityIndexPairOrderer
+ {
+ public:
+ bool operator()(const PriorityIndexPair &p,
+ const PriorityIndexPair &q) const {
+ return p.first > q.first;
+ }
+ };
+
+ // Min-priority queue. The queue stores the indices of the elements, not
+ // the elements themselves to keep down the costs of heap maintenance.
+ typedef std::priority_queue<PriorityIndexPair,
+ std::vector<PriorityIndexPair>,
+ PriorityIndexPairOrderer> Queue;
+
+ // The elements are stored in a vector. Note that the size of this vector
+ // can be greater than m_size (after a call to LazyClear).
+ std::vector<T> m_elements;
+
+ // The number of elements currently held.
+ std::size_t m_size;
+
+ // The maximum number of elements.
+ const std::size_t m_limit;
+
+ // The min-priority queue.
+ Queue m_queue;
+};
+
+template<typename T>
+BoundedPriorityContainer<T>::BoundedPriorityContainer(std::size_t limit)
+ : m_size(0)
+ , m_limit(limit)
+{
+ m_elements.reserve(m_limit);
+}
+
+template<typename T>
+bool BoundedPriorityContainer<T>::Insert(const T &t, float priority)
+{
+ if (m_size < m_limit) {
+ PriorityIndexPair pair(priority, m_size);
+ m_queue.push(pair);
+ if (m_size < m_elements.size()) {
+ m_elements[m_size] = t;
+ } else {
+ m_elements.push_back(t);
+ }
+ ++m_size;
+ return true;
+ } else if (priority > m_queue.top().first) {
+ PriorityIndexPair pair = m_queue.top();
+ m_queue.pop();
+ pair.first = priority;
+ m_elements[pair.second] = t;
+ m_queue.push(pair);
+ return true;
+ }
+ return false;
+}
+
+template<typename T>
+bool BoundedPriorityContainer<T>::SwapIn(T &t, float priority)
+{
+ if (m_size < m_limit) {
+ PriorityIndexPair pair(priority, m_size);
+ m_queue.push(pair);
+ if (m_size < m_elements.size()) {
+ swap(m_elements[m_size], t);
+ } else {
+ m_elements.push_back(t);
+ }
+ ++m_size;
+ return true;
+ } else if (priority > m_queue.top().first) {
+ PriorityIndexPair pair = m_queue.top();
+ m_queue.pop();
+ pair.first = priority;
+ swap(m_elements[pair.second], t);
+ m_queue.push(pair);
+ return true;
+ }
+ return false;
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/Cube.cpp b/moses/Syntax/Cube.cpp
new file mode 100644
index 000000000..4fcf50829
--- /dev/null
+++ b/moses/Syntax/Cube.cpp
@@ -0,0 +1,138 @@
+#include "Cube.h"
+
+#include "moses/FF/FFState.h"
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/StaticData.h"
+
+#include "SVertex.h"
+
+namespace Moses
+{
+
+namespace Syntax
+{
+
+Cube::Cube(const SHyperedgeBundle &bundle)
+ : m_bundle(bundle)
+{
+ // Create the SHyperedge for the 'corner' of the cube.
+ std::vector<int> coordinates(bundle.stacks.size()+1, 0);
+ SHyperedge *hyperedge = CreateHyperedge(coordinates);
+ // Add its coordinates to the set of visited coordinates.
+ std::pair<CoordinateSet::iterator, bool> p = m_visited.insert(coordinates);
+ const std::vector<int> &storedCoordinates = *p.first;
+ // Add the SHyperedge to the queue along with its coordinates (which will be
+ // needed for creating its neighbours).
+ m_queue.push(QueueItem(hyperedge, &storedCoordinates));
+}
+
+Cube::~Cube()
+{
+ // Delete the SHyperedges belonging to any unpopped items. Note that the
+ // coordinate vectors are not deleted here since they are owned by m_visited
+ // (and so will be deleted by its destructor).
+ while (!m_queue.empty()) {
+ QueueItem item = m_queue.top();
+ m_queue.pop();
+ // Delete hyperedge and its head (head deletes hyperedge).
+ delete item.first->head; // TODO shared ownership of head vertex?
+ }
+}
+
+SHyperedge *Cube::Pop()
+{
+ QueueItem item = m_queue.top();
+ m_queue.pop();
+ CreateNeighbours(*item.second);
+ return item.first;
+}
+
+void Cube::CreateNeighbours(const std::vector<int> &coordinates)
+{
+ // Create a copy of the origin coordinates that will be adjusted for
+ // each neighbour.
+ std::vector<int> tmpCoordinates(coordinates);
+
+ // Create each neighbour along the vertex stack dimensions.
+ for (std::size_t i = 0; i < coordinates.size()-1; ++i) {
+ int x = coordinates[i];
+ if (m_bundle.stacks[i]->size() > x+1) {
+ ++tmpCoordinates[i];
+ CreateNeighbour(tmpCoordinates);
+ --tmpCoordinates[i];
+ }
+ }
+ // Create the neighbour along the translation dimension.
+ int x = coordinates.back();
+ if (m_bundle.translations->GetSize() > x+1) {
+ ++tmpCoordinates.back();
+ CreateNeighbour(tmpCoordinates);
+ --tmpCoordinates.back();
+ }
+}
+
+void Cube::CreateNeighbour(const std::vector<int> &coordinates)
+{
+ // Add the coordinates to the set of visited coordinates if not already
+ // present.
+ std::pair<CoordinateSet::iterator, bool> p = m_visited.insert(coordinates);
+ if (!p.second) {
+ // We have visited this neighbour before, so there is nothing to do.
+ return;
+ }
+ SHyperedge *hyperedge = CreateHyperedge(coordinates);
+ const std::vector<int> &storedCoordinates = *p.first;
+ m_queue.push(QueueItem(hyperedge, &storedCoordinates));
+}
+
+SHyperedge *Cube::CreateHyperedge(const std::vector<int> &coordinates)
+{
+ SHyperedge *hyperedge = new SHyperedge();
+
+ SVertex *head = new SVertex();
+ head->best = hyperedge;
+ head->pvertex = 0; // FIXME???
+ head->state.resize(
+ StatefulFeatureFunction::GetStatefulFeatureFunctions().size());
+ hyperedge->head = head;
+
+ hyperedge->tail.resize(coordinates.size()-1);
+ for (std::size_t i = 0; i < coordinates.size()-1; ++i) {
+ boost::shared_ptr<SVertex> pred = (*m_bundle.stacks[i])[coordinates[i]];
+ hyperedge->tail[i] = pred.get();
+ if (pred->best) {
+ hyperedge->scoreBreakdown.PlusEquals(pred->best->scoreBreakdown);
+ }
+ }
+ hyperedge->translation = *(m_bundle.translations->begin()+coordinates.back());
+ hyperedge->scoreBreakdown.PlusEquals(hyperedge->translation->GetScoreBreakdown());
+
+ const StaticData &staticData = StaticData::Instance();
+
+ // compute values of stateless feature functions that were not
+ // cached in the translation option-- there is no principled distinction
+ const std::vector<const StatelessFeatureFunction*>& sfs =
+ StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ for (unsigned i = 0; i < sfs.size(); ++i) {
+ if (!staticData.IsFeatureFunctionIgnored(*sfs[i])) {
+ sfs[i]->EvaluateWhenApplied(*hyperedge, &hyperedge->scoreBreakdown);
+ }
+ }
+
+ const std::vector<const StatefulFeatureFunction*>& ffs =
+ StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for (unsigned i = 0; i < ffs.size(); ++i) {
+ if (!staticData.IsFeatureFunctionIgnored(*ffs[i])) {
+ head->state[i] =
+ ffs[i]->EvaluateWhenApplied(*hyperedge, i, &hyperedge->scoreBreakdown);
+ }
+ }
+
+ hyperedge->score = hyperedge->scoreBreakdown.GetWeightedScore();
+
+ return hyperedge;
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/Cube.h b/moses/Syntax/Cube.h
new file mode 100644
index 000000000..a28440834
--- /dev/null
+++ b/moses/Syntax/Cube.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <queue>
+#include <vector>
+#include <utility>
+
+#include <boost/unordered_set.hpp>
+
+#include "SHyperedge.h"
+#include "SHyperedgeBundle.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// A cube -- in the cube pruning sense (see Chiang (2007)) -- that lazily
+// produces SHyperedge objects from a SHyperedgeBundle in approximately
+// best-first order.
+class Cube
+{
+ public:
+ Cube(const SHyperedgeBundle &);
+ ~Cube();
+
+ SHyperedge *Pop();
+
+ SHyperedge *Top() const { return m_queue.top().first; }
+
+ bool IsEmpty() const { return m_queue.empty(); }
+
+ private:
+ typedef boost::unordered_set<std::vector<int> > CoordinateSet;
+
+ typedef std::pair<SHyperedge *, const std::vector<int> *> QueueItem;
+
+ class QueueItemOrderer
+ {
+ public:
+ bool operator()(const QueueItem &p, const QueueItem &q) const {
+ return p.first->score < q.first->score;
+ }
+ };
+
+ typedef std::priority_queue<QueueItem, std::vector<QueueItem>,
+ QueueItemOrderer> Queue;
+
+ SHyperedge *CreateHyperedge(const std::vector<int> &);
+ void CreateNeighbour(const std::vector<int> &);
+ void CreateNeighbours(const std::vector<int> &);
+
+ const SHyperedgeBundle &m_bundle;
+ CoordinateSet m_visited;
+ Queue m_queue;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/CubeQueue.cpp b/moses/Syntax/CubeQueue.cpp
new file mode 100644
index 000000000..5bb8c2a98
--- /dev/null
+++ b/moses/Syntax/CubeQueue.cpp
@@ -0,0 +1,37 @@
+#include "CubeQueue.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+CubeQueue::~CubeQueue()
+{
+ while (!m_queue.empty()) {
+ Cube *cube = m_queue.top();
+ m_queue.pop();
+ delete cube;
+ }
+}
+
+SHyperedge *CubeQueue::Pop()
+{
+ // pop the most promising cube
+ Cube *cube = m_queue.top();
+ m_queue.pop();
+
+ // pop the most promising hyperedge from the cube
+ SHyperedge *hyperedge = cube->Pop();
+
+ // if the cube contains more items then push it back onto the queue
+ if (!cube->IsEmpty()) {
+ m_queue.push(cube);
+ } else {
+ delete cube;
+ }
+
+ return hyperedge;
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/CubeQueue.h b/moses/Syntax/CubeQueue.h
new file mode 100644
index 000000000..304e59409
--- /dev/null
+++ b/moses/Syntax/CubeQueue.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <queue>
+#include <vector>
+
+#include "Cube.h"
+#include "SHyperedge.h"
+#include "SHyperedgeBundle.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+class CubeQueue
+{
+ public:
+ template<typename InputIterator>
+ CubeQueue(InputIterator, InputIterator);
+
+ ~CubeQueue();
+
+ SHyperedge *Pop();
+
+ bool IsEmpty() const { return m_queue.empty(); }
+
+ private:
+ class CubeOrderer
+ {
+ public:
+ bool operator()(const Cube *p, const Cube *q) const {
+ return p->Top()->score < q->Top()->score;
+ }
+ };
+
+ typedef std::priority_queue<Cube*, std::vector<Cube*>, CubeOrderer> Queue;
+
+ Queue m_queue;
+};
+
+template<typename InputIterator>
+CubeQueue::CubeQueue(InputIterator first, InputIterator last)
+{
+ while (first != last) {
+ m_queue.push(new Cube(*first++));
+ }
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/KBestExtractor.cpp b/moses/Syntax/KBestExtractor.cpp
new file mode 100644
index 000000000..335d80409
--- /dev/null
+++ b/moses/Syntax/KBestExtractor.cpp
@@ -0,0 +1,317 @@
+#include "KBestExtractor.h"
+
+#include "moses/ScoreComponentCollection.h"
+#include "moses/StaticData.h"
+
+#include <boost/scoped_ptr.hpp>
+
+#include <vector>
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// Extract the k-best list from the search graph.
+void KBestExtractor::Extract(
+ const std::vector<boost::shared_ptr<SVertex> > &topLevelVertices,
+ std::size_t k, KBestVec &kBestList)
+{
+ kBestList.clear();
+ if (topLevelVertices.empty()) {
+ return;
+ }
+
+ // Create a new SVertex, supremeVertex, that has the best top-level SVertex as
+ // its predecessor and has the same score.
+ std::vector<boost::shared_ptr<SVertex> >::const_iterator p =
+ topLevelVertices.begin();
+ SVertex &bestTopLevelVertex = **p;
+ boost::scoped_ptr<SVertex> supremeVertex(new SVertex());
+ supremeVertex->pvertex = 0;
+ supremeVertex->best = new SHyperedge();
+ supremeVertex->best->head = supremeVertex.get();
+ supremeVertex->best->tail.push_back(&bestTopLevelVertex);
+ supremeVertex->best->score = bestTopLevelVertex.best->score;
+ supremeVertex->best->scoreBreakdown = bestTopLevelVertex.best->scoreBreakdown;
+ supremeVertex->best->translation = 0;
+
+ // For each alternative top-level SVertex, add a new incoming hyperedge to
+ // supremeVertex.
+ for (++p; p != topLevelVertices.end(); ++p) {
+ // Check that the first item in topLevelVertices really was the best.
+ UTIL_THROW_IF2((*p)->best->score > bestTopLevelVertex.best->score,
+ "top-level SVertices are not correctly sorted");
+ // Note: there's no need for a smart pointer here: supremeVertex will take
+ // ownership of altEdge.
+ SHyperedge *altEdge = new SHyperedge();
+ altEdge->head = supremeVertex.get();
+ altEdge->tail.push_back((*p).get());
+ altEdge->score = (*p)->best->score;
+ altEdge->scoreBreakdown = (*p)->best->scoreBreakdown;
+ altEdge->translation = 0;
+ supremeVertex->recombined.push_back(altEdge);
+ }
+
+ // Create the target vertex then lazily fill its k-best list.
+ boost::shared_ptr<KVertex> targetVertex = FindOrCreateVertex(*supremeVertex);
+ LazyKthBest(targetVertex, k, k);
+
+ // Copy the k-best list from the target vertex, but drop the top edge from
+ // each derivation.
+ kBestList.reserve(targetVertex->kBestList.size());
+ for (std::vector<boost::weak_ptr<Derivation> >::const_iterator
+ q = targetVertex->kBestList.begin();
+ q != targetVertex->kBestList.end(); ++q) {
+ const boost::shared_ptr<Derivation> d(*q);
+ assert(d);
+ assert(d->subderivations.size() == 1);
+ kBestList.push_back(d->subderivations[0]);
+ }
+}
+
+// Generate the target-side yield of the derivation d.
+Phrase KBestExtractor::GetOutputPhrase(const Derivation &d)
+{
+ FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
+
+ Phrase ret(ARRAY_SIZE_INCR);
+
+ const TargetPhrase &phrase = *(d.edge->shyperedge.translation);
+ const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
+ phrase.GetAlignNonTerm().GetNonTermIndexMap();
+ for (std::size_t pos = 0; pos < phrase.GetSize(); ++pos) {
+ const Word &word = phrase.GetWord(pos);
+ if (word.IsNonTerminal()) {
+ std::size_t nonTermInd = nonTermIndexMap[pos];
+ const Derivation &subderivation = *d.subderivations[nonTermInd];
+ Phrase subPhrase = GetOutputPhrase(subderivation);
+ ret.Append(subPhrase);
+ } else {
+ ret.AddWord(word);
+ if (placeholderFactor == NOT_FOUND) {
+ continue;
+ }
+ // FIXME
+ UTIL_THROW2("placeholders are not currently supported by the S2T decoder");
+/*
+ std::set<std::size_t> sourcePosSet =
+ phrase.GetAlignTerm().GetAlignmentsForTarget(pos);
+ if (sourcePosSet.size() == 1) {
+ const std::vector<const Word*> *ruleSourceFromInputPath =
+ hypo.GetTranslationOption().GetSourceRuleFromInputPath();
+ UTIL_THROW_IF2(ruleSourceFromInputPath == NULL,
+ "Source Words in of the rules hasn't been filled out");
+ std::size_t sourcePos = *sourcePosSet.begin();
+ const Word *sourceWord = ruleSourceFromInputPath->at(sourcePos);
+ UTIL_THROW_IF2(sourceWord == NULL,
+ "Null source word at position " << sourcePos);
+ const Factor *factor = sourceWord->GetFactor(placeholderFactor);
+ if (factor) {
+ ret.Back()[0] = factor;
+ }
+ }
+*/
+ }
+ }
+
+ return ret;
+}
+
+// Generate the target tree of the derivation d.
+TreePointer KBestExtractor::GetOutputTree(const Derivation &d)
+{
+ const TargetPhrase &phrase = *(d.edge->shyperedge.translation);
+ if (const PhraseProperty *property = phrase.GetProperty("Tree")) {
+ const std::string *tree = property->GetValueString();
+ TreePointer mytree (boost::make_shared<InternalTree>(*tree));
+
+ //get subtrees (in target order)
+ std::vector<TreePointer> previous_trees;
+ for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
+ const Word &word = phrase.GetWord(pos);
+ if (word.IsNonTerminal()) {
+ size_t nonTermInd = phrase.GetAlignNonTerm().GetNonTermIndexMap()[pos];
+ const Derivation &subderivation = *d.subderivations[nonTermInd];
+ const TreePointer prev_tree = GetOutputTree(subderivation);
+ previous_trees.push_back(prev_tree);
+ }
+ }
+
+ mytree->Combine(previous_trees);
+ return mytree;
+ }
+ else {
+ UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
+ }
+}
+
+// Look for the vertex corresponding to a given SVertex, creating
+// a new one if necessary.
+boost::shared_ptr<KBestExtractor::KVertex>
+KBestExtractor::FindOrCreateVertex(const SVertex &v)
+{
+ // KVertex nodes should not be created for terminal nodes.
+ assert(v.best);
+
+ VertexMap::value_type element(&v, boost::shared_ptr<KVertex>());
+ std::pair<VertexMap::iterator, bool> p = m_vertexMap.insert(element);
+ boost::shared_ptr<KVertex> &sp = p.first->second;
+ if (!p.second) {
+ return sp; // KVertex was already in m_vertexMap.
+ }
+ sp.reset(new KVertex(v));
+ // Create the 1-best derivation and add it to the vertex's kBestList.
+ boost::shared_ptr<KHyperedge> bestEdge(new KHyperedge(*(v.best)));
+ bestEdge->head = sp;
+ std::size_t kTailSize = 0;
+ for (std::size_t i = 0; i < v.best->tail.size(); ++i) {
+ const SVertex *pred = v.best->tail[i];
+ if (pred->best) {
+ ++kTailSize;
+ }
+ }
+ bestEdge->tail.reserve(kTailSize);
+ for (std::size_t i = 0; i < v.best->tail.size(); ++i) {
+ const SVertex *pred = v.best->tail[i];
+ if (pred->best) {
+ bestEdge->tail.push_back(FindOrCreateVertex(*pred));
+ }
+ }
+ boost::shared_ptr<Derivation> bestDerivation(new Derivation(bestEdge));
+#ifndef NDEBUG
+ std::pair<DerivationSet::iterator, bool> q =
+#endif
+ m_derivations.insert(bestDerivation);
+ assert(q.second);
+ sp->kBestList.push_back(bestDerivation);
+ return sp;
+}
+
+// Create the 1-best derivation for each edge in BS(v) (except the best one)
+// and add it to v's candidate queue.
+void KBestExtractor::GetCandidates(boost::shared_ptr<KVertex> v, std::size_t k)
+{
+ // Create 1-best derivations for all of v's incoming edges except the best.
+ // The 1-best derivation for that edge will already have been created.
+ for (std::size_t i = 0; i < v->svertex.recombined.size(); ++i) {
+ const SHyperedge &shyperedge = *(v->svertex.recombined[i]);
+ boost::shared_ptr<KHyperedge> bestEdge(new KHyperedge(shyperedge));
+ bestEdge->head = v;
+ // Count the number of incoming vertices that are not terminals.
+ std::size_t kTailSize = 0;
+ for (std::size_t j = 0; j < shyperedge.tail.size(); ++j) {
+ const SVertex *pred = shyperedge.tail[j];
+ if (pred->best) {
+ ++kTailSize;
+ }
+ }
+ bestEdge->tail.reserve(kTailSize);
+ for (std::size_t j = 0; j < shyperedge.tail.size(); ++j) {
+ const SVertex *pred = shyperedge.tail[j];
+ if (pred->best) {
+ bestEdge->tail.push_back(FindOrCreateVertex(*pred));
+ }
+ }
+ boost::shared_ptr<Derivation> derivation(new Derivation(bestEdge));
+#ifndef NDEBUG
+ std::pair<DerivationSet::iterator, bool> q =
+#endif
+ m_derivations.insert(derivation);
+ assert(q.second);
+ v->candidates.push(derivation);
+ }
+}
+
+// Lazily fill v's k-best list.
+void KBestExtractor::LazyKthBest(boost::shared_ptr<KVertex> v, std::size_t k,
+ std::size_t globalK)
+{
+ // If this is the first visit to vertex v then initialize the priority queue.
+ if (v->visited == false) {
+ // The 1-best derivation should already be in v's k-best list.
+ assert(v->kBestList.size() == 1);
+ // Initialize v's priority queue.
+ GetCandidates(v, globalK);
+ v->visited = true;
+ }
+ // Add derivations to the k-best list until it contains k or there are none
+ // left to add.
+ while (v->kBestList.size() < k) {
+ assert(!v->kBestList.empty());
+ // Update the priority queue by adding the successors of the last
+ // derivation (unless they've been seen before).
+ boost::shared_ptr<Derivation> d(v->kBestList.back());
+ LazyNext(*v, *d, globalK);
+ // Check if there are any derivations left in the queue.
+ if (v->candidates.empty()) {
+ break;
+ }
+ // Get the next best derivation and delete it from the queue.
+ boost::weak_ptr<Derivation> next = v->candidates.top();
+ v->candidates.pop();
+ // Add it to the k-best list.
+ v->kBestList.push_back(next);
+ }
+}
+
+// Create the neighbours of Derivation d and add them to v's candidate queue.
+void KBestExtractor::LazyNext(KVertex &v, const Derivation &d,
+ std::size_t globalK)
+{
+ for (std::size_t i = 0; i < d.edge->tail.size(); ++i) {
+ boost::shared_ptr<KVertex> pred = d.edge->tail[i];
+ // Ensure that pred's k-best list contains enough derivations.
+ std::size_t k = d.backPointers[i] + 2;
+ LazyKthBest(pred, k, globalK);
+ if (pred->kBestList.size() < k) {
+ // pred's derivations have been exhausted.
+ continue;
+ }
+ // Create the neighbour.
+ boost::shared_ptr<Derivation> next(new Derivation(d, i));
+ // Check if it has been created before.
+ std::pair<DerivationSet::iterator, bool> p = m_derivations.insert(next);
+ if (p.second) {
+ v.candidates.push(next); // Haven't previously seen it.
+ }
+ }
+}
+
+// Construct the 1-best Derivation that ends at edge e.
+KBestExtractor::Derivation::Derivation(const boost::shared_ptr<KHyperedge> &e)
+{
+ edge = e;
+ std::size_t arity = edge->tail.size();
+ backPointers.resize(arity, 0);
+ subderivations.reserve(arity);
+ for (std::size_t i = 0; i < arity; ++i) {
+ const KVertex &pred = *(edge->tail[i]);
+ assert(pred.kBestList.size() >= 1);
+ boost::shared_ptr<Derivation> sub(pred.kBestList[0]);
+ subderivations.push_back(sub);
+ }
+ score = edge->shyperedge.score;
+ scoreBreakdown = edge->shyperedge.scoreBreakdown;
+}
+
+// Construct a Derivation that neighbours an existing Derivation.
+KBestExtractor::Derivation::Derivation(const Derivation &d, std::size_t i)
+{
+ edge = d.edge;
+ backPointers = d.backPointers;
+ subderivations = d.subderivations;
+ std::size_t j = ++backPointers[i];
+ scoreBreakdown = d.scoreBreakdown;
+ // Deduct the score of the old subderivation.
+ scoreBreakdown.MinusEquals(subderivations[i]->scoreBreakdown);
+ // Update the subderivation pointer.
+ boost::shared_ptr<Derivation> newSub(edge->tail[i]->kBestList[j]);
+ subderivations[i] = newSub;
+ // Add the score of the new subderivation.
+ scoreBreakdown.PlusEquals(subderivations[i]->scoreBreakdown);
+ score = scoreBreakdown.GetWeightedScore();
+}
+
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/KBestExtractor.h b/moses/Syntax/KBestExtractor.h
new file mode 100644
index 000000000..21fb6f737
--- /dev/null
+++ b/moses/Syntax/KBestExtractor.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include <cassert>
+
+#include <queue>
+#include <vector>
+
+#include <boost/unordered_set.hpp>
+#include <boost/weak_ptr.hpp>
+
+#include "moses/ScoreComponentCollection.h"
+#include "moses/FF/InternalTree.h"
+
+#include "SHyperedge.h"
+#include "SVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// k-best list extractor that implements algorithm 3 from this paper:
+//
+// Liang Huang and David Chiang
+// "Better k-best parsing"
+// In Proceedings of IWPT 2005
+//
+class KBestExtractor
+{
+ public:
+ struct KVertex;
+
+ struct KHyperedge {
+ KHyperedge(const SHyperedge &e) : shyperedge(e) {}
+
+ const SHyperedge &shyperedge;
+ boost::shared_ptr<KVertex> head;
+ std::vector<boost::shared_ptr<KVertex> > tail;
+ };
+
+ struct Derivation {
+ Derivation(const boost::shared_ptr<KHyperedge> &);
+ Derivation(const Derivation &, std::size_t);
+
+ boost::shared_ptr<KHyperedge> edge;
+ std::vector<std::size_t> backPointers;
+ std::vector<boost::shared_ptr<Derivation> > subderivations;
+ ScoreComponentCollection scoreBreakdown;
+ float score;
+ };
+
+ struct DerivationOrderer {
+ bool operator()(const boost::weak_ptr<Derivation> &d1,
+ const boost::weak_ptr<Derivation> &d2) const {
+ boost::shared_ptr<Derivation> s1(d1);
+ boost::shared_ptr<Derivation> s2(d2);
+ return s1->score < s2->score;
+ }
+ };
+
+ struct KVertex {
+ typedef std::priority_queue<boost::weak_ptr<Derivation>,
+ std::vector<boost::weak_ptr<Derivation> >,
+ DerivationOrderer> DerivationQueue;
+
+ KVertex(const SVertex &v) : svertex(v), visited(false) {}
+
+ const SVertex &svertex;
+ std::vector<boost::weak_ptr<Derivation> > kBestList;
+ DerivationQueue candidates;
+ bool visited;
+ };
+
+ typedef std::vector<boost::shared_ptr<Derivation> > KBestVec;
+
+ // Extract the k-best list from the search hypergraph given the full, sorted
+ // list of top-level SVertices.
+ void Extract(const std::vector<boost::shared_ptr<SVertex> > &, std::size_t,
+ KBestVec &);
+
+ static Phrase GetOutputPhrase(const Derivation &);
+ static TreePointer GetOutputTree(const Derivation &);
+
+ private:
+ typedef boost::unordered_map<const SVertex *,
+ boost::shared_ptr<KVertex> > VertexMap;
+
+ struct DerivationHasher {
+ std::size_t operator()(const boost::shared_ptr<Derivation> &d) const {
+ std::size_t seed = 0;
+ boost::hash_combine(seed, &(d->edge->shyperedge));
+ boost::hash_combine(seed, d->backPointers);
+ return seed;
+ }
+ };
+
+ struct DerivationEqualityPred {
+ bool operator()(const boost::shared_ptr<Derivation> &d1,
+ const boost::shared_ptr<Derivation> &d2) const {
+ return &(d1->edge->shyperedge) == &(d2->edge->shyperedge) &&
+ d1->backPointers == d2->backPointers;
+ }
+ };
+
+ typedef boost::unordered_set<boost::shared_ptr<Derivation>, DerivationHasher,
+ DerivationEqualityPred> DerivationSet;
+
+ boost::shared_ptr<KVertex> FindOrCreateVertex(const SVertex &);
+ void GetCandidates(boost::shared_ptr<KVertex>, std::size_t);
+ void LazyKthBest(boost::shared_ptr<KVertex>, std::size_t, std::size_t);
+ void LazyNext(KVertex &, const Derivation &, std::size_t);
+
+ VertexMap m_vertexMap;
+ DerivationSet m_derivations;
+};
+
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/NonTerminalMap.h b/moses/Syntax/NonTerminalMap.h
new file mode 100644
index 000000000..ff7ce2508
--- /dev/null
+++ b/moses/Syntax/NonTerminalMap.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+
+#include "SymbolEqualityPred.h"
+#include "SymbolHasher.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// Hybrid map/vector-based container for key-value pairs where the key is a
+// non-terminal Word. The interface is like a (stripped-down) map type, with
+// the main differences being that:
+// 1. Find() is implemented using vector indexing to make it fast.
+// 2. Once a value has been inserted it can be modified but can't be removed.
+template<typename T>
+class NonTerminalMap
+{
+ private:
+ typedef boost::unordered_map<Word, T, SymbolHasher, SymbolEqualityPred> Map;
+ typedef std::vector<T*> Vec;
+
+ public:
+ typedef typename Map::iterator Iterator;
+ typedef typename Map::const_iterator ConstIterator;
+
+ NonTerminalMap()
+ : m_vec(FactorCollection::Instance().GetNumNonTerminals(), NULL) {}
+
+ Iterator Begin() { return m_map.begin(); }
+ Iterator End() { return m_map.end(); }
+
+ ConstIterator Begin() const { return m_map.begin(); }
+ ConstIterator End() const { return m_map.end(); }
+
+ std::size_t Size() const { return m_map.size(); }
+
+ bool IsEmpty() const { return m_map.empty(); }
+
+ std::pair<Iterator, bool> Insert(const Word &, const T &);
+
+ T *Find(const Word &w) const { return m_vec[w[0]->GetId()]; }
+
+ private:
+ Map m_map;
+ Vec m_vec;
+};
+
+template<typename T>
+std::pair<typename NonTerminalMap<T>::Iterator, bool> NonTerminalMap<T>::Insert(
+ const Word &key, const T &value)
+{
+ std::pair<typename Map::iterator, bool> result =
+ m_map.insert(typename Map::value_type(key, value));
+ if (result.second) {
+ T *p = &(result.first->second);
+ std::size_t i = key[0]->GetId();
+ m_vec[i] = p;
+ }
+ return result;
+}
+
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/PHyperedge.h b/moses/Syntax/PHyperedge.h
new file mode 100644
index 000000000..8f236fcb8
--- /dev/null
+++ b/moses/Syntax/PHyperedge.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/TargetPhraseCollection.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex;
+
+struct PHyperedge
+{
+ PVertex *head;
+ std::vector<PVertex*> tail;
+ const TargetPhraseCollection *translations;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/PVertex.h b/moses/Syntax/PVertex.h
new file mode 100644
index 000000000..d82309c82
--- /dev/null
+++ b/moses/Syntax/PVertex.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "moses/Word.h"
+#include "moses/WordsRange.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex
+{
+ public:
+ PVertex(const WordsRange &wr, const Word &w) : span(wr), symbol(w) {}
+
+ WordsRange span;
+ Word symbol;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/RuleTable.h b/moses/Syntax/RuleTable.h
new file mode 100644
index 000000000..90a25d63c
--- /dev/null
+++ b/moses/Syntax/RuleTable.h
@@ -0,0 +1,24 @@
+#pragma once
+
+namespace Moses
+{
+namespace Syntax
+{
+
+class RuleTableFF;
+
+// Base class for any data structure representing a synchronous
+// grammar, like a trie (for S2T) or a DFA (for T2S).
+class RuleTable
+{
+ public:
+ RuleTable(const RuleTableFF *ff) : m_ff(ff) {}
+
+ virtual ~RuleTable() {}
+
+ protected:
+ const RuleTableFF *m_ff;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/RuleTableFF.cpp b/moses/Syntax/RuleTableFF.cpp
new file mode 100644
index 000000000..771c3983c
--- /dev/null
+++ b/moses/Syntax/RuleTableFF.cpp
@@ -0,0 +1,51 @@
+#include "RuleTableFF.h"
+
+#include "moses/StaticData.h"
+#include "moses/Syntax/S2T/RuleTrieCYKPlus.h"
+#include "moses/Syntax/S2T/RuleTrieLoader.h"
+#include "moses/Syntax/S2T/RuleTrieScope3.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+std::vector<RuleTableFF*> RuleTableFF::s_instances;
+
+RuleTableFF::RuleTableFF(const std::string &line)
+ : PhraseDictionary(line)
+{
+ ReadParameters();
+ // caching for memory pt is pointless
+ m_maxCacheSize = 0;
+
+ s_instances.push_back(this);
+}
+
+void RuleTableFF::Load()
+{
+ SetFeaturesToApply();
+
+ const StaticData &staticData = StaticData::Instance();
+ if (!staticData.UseS2TDecoder()) {
+ UTIL_THROW2("ERROR: RuleTableFF currently only supports S2T decoder");
+ } else {
+ S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
+ if (algorithm == RecursiveCYKPlus) {
+ S2T::RuleTrieCYKPlus *trie = new S2T::RuleTrieCYKPlus(this);
+ S2T::RuleTrieLoader loader;
+ loader.Load(m_input, m_output, m_filePath, *this, *trie);
+ m_table = trie;
+ } else if (algorithm == Scope3) {
+ S2T::RuleTrieScope3 *trie = new S2T::RuleTrieScope3(this);
+ S2T::RuleTrieLoader loader;
+ loader.Load(m_input, m_output, m_filePath, *this, *trie);
+ m_table = trie;
+ } else {
+ UTIL_THROW2("ERROR: unhandled S2T parsing algorithm");
+ }
+ }
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/RuleTableFF.h b/moses/Syntax/RuleTableFF.h
new file mode 100644
index 000000000..0e6040612
--- /dev/null
+++ b/moses/Syntax/RuleTableFF.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <string>
+
+#include "moses/TranslationModel/PhraseDictionary.h"
+
+namespace Moses
+{
+
+class ChartParser;
+class ChartCellCollectionBase;
+
+namespace Syntax
+{
+
+class RuleTable;
+
+// Feature function for dealing with local rule scores (that come from a
+// rule table). The scores themselves are stored on TargetPhrase objects
+// and the decoder accesses them directly, so this object doesn't really do
+// anything except provide somewhere to store the weights and parameter values.
+class RuleTableFF : public PhraseDictionary
+{
+ public:
+ RuleTableFF(const std::string &);
+
+ // FIXME Delete m_table?
+ ~RuleTableFF() {}
+
+ void Load();
+
+ const RuleTable *GetTable() const { return m_table; }
+
+ static const std::vector<RuleTableFF*> &Instances() { return s_instances; }
+
+ ChartRuleLookupManager *CreateRuleLookupManager(
+ const ChartParser &, const ChartCellCollectionBase &, std::size_t)
+ {
+ assert(false);
+ return 0;
+ }
+
+ private:
+ static std::vector<RuleTableFF*> s_instances;
+
+ const RuleTable *m_table;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/DerivationWriter.cpp b/moses/Syntax/S2T/DerivationWriter.cpp
new file mode 100644
index 000000000..dcb98b3c6
--- /dev/null
+++ b/moses/Syntax/S2T/DerivationWriter.cpp
@@ -0,0 +1,100 @@
+#include "DerivationWriter.h"
+
+#include "moses/Factor.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SHyperedge.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// 1-best version.
+void DerivationWriter::Write(const SHyperedge &shyperedge,
+ std::size_t sentNum, std::ostream &out)
+{
+ WriteLine(shyperedge, sentNum, out);
+ for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) {
+ const SVertex &pred = *(shyperedge.tail[i]);
+ if (pred.best) {
+ Write(*pred.best, sentNum, out);
+ }
+ }
+}
+
+// k-best derivation.
+void DerivationWriter::Write(const KBestExtractor::Derivation &derivation,
+ std::size_t sentNum, std::ostream &out)
+{
+ WriteLine(derivation.edge->shyperedge, sentNum, out);
+ for (std::size_t i = 0; i < derivation.subderivations.size(); ++i) {
+ Write(*(derivation.subderivations[i]), sentNum, out);
+ }
+}
+
+void DerivationWriter::WriteLine(const SHyperedge &shyperedge,
+ std::size_t sentNum, std::ostream &out)
+{
+ // Sentence number.
+ out << sentNum << " |||";
+
+ // Source LHS.
+ out << " [X] ->";
+
+ // Source RHS symbols.
+ for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) {
+ const Word &symbol = shyperedge.tail[i]->pvertex->symbol;
+ out << " ";
+ if (symbol.IsNonTerminal()) {
+ out << "[X]";
+ } else {
+ WriteSymbol(symbol, out);
+ }
+ }
+ out << " |||";
+
+ // Target RHS.
+ out << " ";
+ WriteSymbol(shyperedge.head->pvertex->symbol, out);
+ out << " ->";
+
+ // Target RHS symbols.
+ const TargetPhrase &phrase = *(shyperedge.translation);
+ for (std::size_t i = 0; i < phrase.GetSize(); ++i) {
+ out << " ";
+ WriteSymbol(phrase.GetWord(i), out);
+ }
+ out << " |||";
+
+ // Non-terminal alignments
+ const AlignmentInfo &a = phrase.GetAlignNonTerm();
+ for (AlignmentInfo::const_iterator p = a.begin(); p != a.end(); ++p) {
+ out << " " << p->first << "-" << p->second;
+ }
+ out << " |||";
+
+ // Spans covered by source RHS symbols.
+ for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) {
+ const SVertex *child = shyperedge.tail[i];
+ const WordsRange &span = child->pvertex->span;
+ out << " " << span.GetStartPos() << ".." << span.GetEndPos();
+ }
+
+ out << "\n";
+}
+
+void DerivationWriter::WriteSymbol(const Word &symbol, std::ostream &out)
+{
+ const Factor *f = symbol[0];
+ if (symbol.IsNonTerminal()) {
+ out << "[" << f->GetString() << "]";
+ } else {
+ out << f->GetString();
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/DerivationWriter.h b/moses/Syntax/S2T/DerivationWriter.h
new file mode 100644
index 000000000..706490ce0
--- /dev/null
+++ b/moses/Syntax/S2T/DerivationWriter.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <ostream>
+
+#include "moses/Syntax/KBestExtractor.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+struct SHyperedge;
+
+namespace S2T
+{
+
+// Writes a string representation of a derivation to a std::ostream. This is
+// used by the -translation-details / -T option.
+// TODO DerivationWriter currently assumes string-to-tree (which is why it's
+// TODO in the S2T namespace) but it would be easy to generalise it. This
+// TODO should be revisited when other the decoders are implemented.
+class DerivationWriter
+{
+ public:
+ // 1-best version.
+ static void Write(const SHyperedge&, std::size_t, std::ostream &);
+
+ // k-best version.
+ static void Write(const KBestExtractor::Derivation &, std::size_t,
+ std::ostream &);
+ private:
+ static void WriteLine(const SHyperedge &, std::size_t, std::ostream &);
+ static void WriteSymbol(const Word &, std::ostream &);
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Manager-inl.h b/moses/Syntax/S2T/Manager-inl.h
new file mode 100644
index 000000000..3351d1a9f
--- /dev/null
+++ b/moses/Syntax/S2T/Manager-inl.h
@@ -0,0 +1,599 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "moses/DecodeGraph.h"
+#include "moses/StaticData.h"
+#include "moses/Syntax/BoundedPriorityContainer.h"
+#include "moses/Syntax/CubeQueue.h"
+#include "moses/Syntax/PHyperedge.h"
+#include "moses/Syntax/RuleTable.h"
+#include "moses/Syntax/RuleTableFF.h"
+#include "moses/Syntax/SHyperedgeBundle.h"
+#include "moses/Syntax/SVertex.h"
+#include "moses/Syntax/SVertexRecombinationOrderer.h"
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "DerivationWriter.h"
+
+#include "OovHandler.h"
+#include "PChart.h"
+#include "RuleTrie.h"
+#include "SChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename Parser>
+Manager<Parser>::Manager(const InputType &source)
+ : BaseManager(source)
+ , m_pchart(source.GetSize(), Parser::RequiresCompressedChart())
+ , m_schart(source.GetSize())
+{
+}
+
+template<typename Parser>
+void Manager<Parser>::InitializeCharts()
+{
+ // Create a PVertex object and a SVertex object for each source word.
+ for (std::size_t i = 0; i < m_source.GetSize(); ++i) {
+ const Word &terminal = m_source.GetWord(i);
+
+ // PVertex
+ PVertex tmp(WordsRange(i,i), m_source.GetWord(i));
+ PVertex &pvertex = m_pchart.AddVertex(tmp);
+
+ // SVertex
+ boost::shared_ptr<SVertex> v(new SVertex());
+ v->best = 0;
+ v->pvertex = &pvertex;
+ SChart::Cell &scell = m_schart.GetCell(i,i);
+ SVertexStack stack(1, v);
+ SChart::Cell::TMap::value_type x(terminal, stack);
+ scell.terminalStacks.insert(x);
+ }
+}
+
+template<typename Parser>
+void Manager<Parser>::InitializeParsers(PChart &pchart,
+ std::size_t ruleLimit)
+{
+ const std::vector<RuleTableFF*> &ffs = RuleTableFF::Instances();
+
+ const std::vector<DecodeGraph*> &graphs =
+ StaticData::Instance().GetDecodeGraphs();
+
+ UTIL_THROW_IF2(ffs.size() != graphs.size(),
+ "number of RuleTables does not match number of decode graphs");
+
+ for (std::size_t i = 0; i < ffs.size(); ++i) {
+ RuleTableFF *ff = ffs[i];
+ std::size_t maxChartSpan = graphs[i]->GetMaxChartSpan();
+ // This may change in the future, but currently we assume that every
+ // RuleTableFF is associated with a static, file-based rule table of
+ // some sort and that the table should have been loaded into a RuleTable
+ // by this point.
+ const RuleTable *table = ff->GetTable();
+ assert(table);
+ RuleTable *nonConstTable = const_cast<RuleTable*>(table);
+ boost::shared_ptr<Parser> parser;
+ typename Parser::RuleTrie *trie =
+ dynamic_cast<typename Parser::RuleTrie*>(nonConstTable);
+ assert(trie);
+ parser.reset(new Parser(pchart, *trie, maxChartSpan));
+ m_parsers.push_back(parser);
+ }
+
+ // Check for OOVs and synthesize an additional rule trie + parser if
+ // necessary.
+ m_oovs.clear();
+ std::size_t maxOovWidth = 0;
+ FindOovs(pchart, m_oovs, maxOovWidth);
+ if (!m_oovs.empty()) {
+ // FIXME Add a hidden RuleTableFF for unknown words(?)
+ OovHandler<typename Parser::RuleTrie> oovHandler(*ffs[0]);
+ m_oovRuleTrie = oovHandler.SynthesizeRuleTrie(m_oovs.begin(), m_oovs.end());
+ // Create a parser for the OOV rule trie.
+ boost::shared_ptr<Parser> parser(
+ new Parser(pchart, *m_oovRuleTrie, maxOovWidth));
+ m_parsers.push_back(parser);
+ }
+}
+
+// Find the set of OOVs for this input. This function assumes that the
+// PChart argument has already been initialized from the input.
+template<typename Parser>
+void Manager<Parser>::FindOovs(const PChart &pchart, std::set<Word> &oovs,
+ std::size_t maxOovWidth)
+{
+ // Get the set of RuleTries.
+ std::vector<const RuleTrie *> tries;
+ const std::vector<RuleTableFF*> &ffs = RuleTableFF::Instances();
+ for (std::size_t i = 0; i < ffs.size(); ++i) {
+ const RuleTableFF *ff = ffs[i];
+ if (ff->GetTable()) {
+ const RuleTrie *trie = dynamic_cast<const RuleTrie*>(ff->GetTable());
+ assert(trie); // FIXME
+ tries.push_back(trie);
+ }
+ }
+
+ // For every sink vertex in pchart (except for <s> and </s>), check whether
+ // the word has a preterminal rule in any of the rule tables. If not then
+ // add it to the OOV set.
+ oovs.clear();
+ maxOovWidth = 0;
+ // Assume <s> and </s> have been added at sentence boundaries, so skip
+ // cells starting at position 0 and ending at the last position.
+ for (std::size_t i = 1; i < pchart.GetWidth()-1; ++i) {
+ for (std::size_t j = i; j < pchart.GetWidth()-1; ++j) {
+ std::size_t width = j-i+1;
+ const PChart::Cell::TMap &map = pchart.GetCell(i,j).terminalVertices;
+ for (PChart::Cell::TMap::const_iterator p = map.begin();
+ p != map.end(); ++p) {
+ const Word &word = p->first;
+ assert(!word.IsNonTerminal());
+ bool found = false;
+ for (std::vector<const RuleTrie *>::const_iterator q = tries.begin();
+ q != tries.end(); ++q) {
+ const RuleTrie *trie = *q;
+ if (trie->HasPreterminalRule(word)) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ oovs.insert(word);
+ maxOovWidth = std::max(maxOovWidth, width);
+ }
+ }
+ }
+ }
+}
+
+template<typename Parser>
+void Manager<Parser>::Decode()
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ // Get various pruning-related constants.
+ const std::size_t popLimit = staticData.GetCubePruningPopLimit();
+ const std::size_t ruleLimit = staticData.GetRuleLimit();
+ const std::size_t stackLimit = staticData.GetMaxHypoStackSize();
+
+ // Initialise the PChart and SChart.
+ InitializeCharts();
+
+ // Initialize the parsers.
+ InitializeParsers(m_pchart, ruleLimit);
+
+ // Create a callback to process the PHyperedges produced by the parsers.
+ typename Parser::CallbackType callback(m_schart, ruleLimit);
+
+ // Visit each cell of PChart in right-to-left depth-first order.
+ std::size_t size = m_source.GetSize();
+ for (int start = size-1; start >= 0; --start) {
+ for (std::size_t width = 1; width <= size-start; ++width) {
+ std::size_t end = start + width - 1;
+
+ //PChart::Cell &pcell = m_pchart.GetCell(start, end);
+ SChart::Cell &scell = m_schart.GetCell(start, end);
+
+ WordsRange range(start, end);
+
+ // Call the parsers to generate PHyperedges for this span and convert
+ // each one to a SHyperedgeBundle (via the callback). The callback
+ // prunes the SHyperedgeBundles and keeps the best ones (up to ruleLimit).
+ callback.InitForRange(range);
+ for (typename std::vector<boost::shared_ptr<Parser> >::iterator
+ p = m_parsers.begin(); p != m_parsers.end(); ++p) {
+ (*p)->EnumerateHyperedges(range, callback);
+ }
+
+ // Retrieve the (pruned) set of SHyperedgeBundles from the callback.
+ const BoundedPriorityContainer<SHyperedgeBundle> &bundles =
+ callback.GetContainer();
+
+ // Use cube pruning to extract SHyperedges from SHyperedgeBundles.
+ // Collect the SHyperedges into buffers, one for each category.
+ CubeQueue cubeQueue(bundles.Begin(), bundles.End());
+ std::size_t count = 0;
+ typedef boost::unordered_map<Word, std::vector<SHyperedge*>,
+ SymbolHasher, SymbolEqualityPred > BufferMap;
+ BufferMap buffers;
+ while (count < popLimit && !cubeQueue.IsEmpty()) {
+ SHyperedge *hyperedge = cubeQueue.Pop();
+ // BEGIN{HACK}
+ // The way things currently work, the LHS of each hyperedge is not
+ // determined until just before the point of its creation, when a
+ // target phrase is selected from the list of possible phrases (which
+ // happens during cube pruning). The cube pruning code doesn't (and
+ // shouldn't) know about the contents of PChart and so creation of
+ // the PVertex is deferred until this point.
+ const Word &lhs = hyperedge->translation->GetTargetLHS();
+ hyperedge->head->pvertex = &m_pchart.AddVertex(PVertex(range, lhs));
+ // END{HACK}
+ buffers[lhs].push_back(hyperedge);
+ ++count;
+ }
+
+ // Recombine SVertices and sort into stacks.
+ for (BufferMap::const_iterator p = buffers.begin(); p != buffers.end();
+ ++p) {
+ const Word &category = p->first;
+ const std::vector<SHyperedge*> &buffer = p->second;
+ std::pair<SChart::Cell::NMap::Iterator, bool> ret =
+ scell.nonTerminalStacks.Insert(category, SVertexStack());
+ assert(ret.second);
+ SVertexStack &stack = ret.first->second;
+ RecombineAndSort(buffer, stack);
+ }
+
+ // Prune stacks.
+ if (stackLimit > 0) {
+ for (SChart::Cell::NMap::Iterator p = scell.nonTerminalStacks.Begin();
+ p != scell.nonTerminalStacks.End(); ++p) {
+ SVertexStack &stack = p->second;
+ if (stack.size() > stackLimit) {
+ stack.resize(stackLimit);
+ }
+ }
+ }
+
+ // Prune the PChart cell for this span by removing vertices for
+ // categories that don't occur in the SChart.
+// Note: see HACK above. Pruning the chart isn't currently necessary.
+// PrunePChart(scell, pcell);
+ }
+ }
+}
+
+template<typename Parser>
+const SHyperedge *Manager<Parser>::GetBestSHyperedge() const
+{
+ const SChart::Cell &cell = m_schart.GetCell(0, m_source.GetSize()-1);
+ const SChart::Cell::NMap &stacks = cell.nonTerminalStacks;
+ if (stacks.Size() == 0) {
+ return 0;
+ }
+ assert(stacks.Size() == 1);
+ const std::vector<boost::shared_ptr<SVertex> > &stack = stacks.Begin()->second;
+ return stack[0]->best;
+}
+
+template<typename Parser>
+void Manager<Parser>::ExtractKBest(
+ std::size_t k,
+ std::vector<boost::shared_ptr<KBestExtractor::Derivation> > &kBestList,
+ bool onlyDistinct) const
+{
+ kBestList.clear();
+ if (k == 0 || m_source.GetSize() == 0) {
+ return;
+ }
+
+ // Get the top-level SVertex stack.
+ const SChart::Cell &cell = m_schart.GetCell(0, m_source.GetSize()-1);
+ const SChart::Cell::NMap &stacks = cell.nonTerminalStacks;
+ if (stacks.Size() == 0) {
+ return;
+ }
+ assert(stacks.Size() == 1);
+ const std::vector<boost::shared_ptr<SVertex> > &stack = stacks.Begin()->second;
+
+ KBestExtractor extractor;
+
+ if (!onlyDistinct) {
+ // Return the k-best list as is, including duplicate translations.
+ extractor.Extract(stack, k, kBestList);
+ return;
+ }
+
+ // Determine how many derivations to extract. If the k-best list is
+ // restricted to distinct translations then this limit should be bigger
+ // than k. The k-best factor determines how much bigger the limit should be,
+ // with 0 being 'unlimited.' This actually sets a large-ish limit in case
+ // too many translations are identical.
+ const StaticData &staticData = StaticData::Instance();
+ const std::size_t nBestFactor = staticData.GetNBestFactor();
+ std::size_t numDerivations = (nBestFactor == 0) ? k*1000 : k*nBestFactor;
+
+ // Extract the derivations.
+ KBestExtractor::KBestVec bigList;
+ bigList.reserve(numDerivations);
+ extractor.Extract(stack, numDerivations, bigList);
+
+ // Copy derivations into kBestList, skipping ones with repeated translations.
+ std::set<Phrase> distinct;
+ for (KBestExtractor::KBestVec::const_iterator p = bigList.begin();
+ kBestList.size() < k && p != bigList.end(); ++p) {
+ boost::shared_ptr<KBestExtractor::Derivation> derivation = *p;
+ Phrase translation = KBestExtractor::GetOutputPhrase(*derivation);
+ if (distinct.insert(translation).second) {
+ kBestList.push_back(derivation);
+ }
+ }
+}
+
+template<typename Parser>
+void Manager<Parser>::PrunePChart(const SChart::Cell &scell,
+ PChart::Cell &pcell)
+{
+/* FIXME
+ PChart::Cell::VertexMap::iterator p = pcell.vertices.begin();
+ while (p != pcell.vertices.end()) {
+ const Word &category = p->first;
+ if (scell.stacks.find(category) == scell.stacks.end()) {
+ PChart::Cell::VertexMap::iterator q = p++;
+ pcell.vertices.erase(q);
+ } else {
+ ++p;
+ }
+ }
+*/
+}
+
+template<typename Parser>
+void Manager<Parser>::RecombineAndSort(const std::vector<SHyperedge*> &buffer,
+ SVertexStack &stack)
+{
+ // Step 1: Create a map containing a single instance of each distinct vertex
+ // (where distinctness is defined by the state value). The hyperedges'
+ // head pointers are updated to point to the vertex instances in the map and
+ // any 'duplicate' vertices are deleted.
+// TODO Set?
+ typedef std::map<SVertex *, SVertex *, SVertexRecombinationOrderer> Map;
+ Map map;
+ for (std::vector<SHyperedge*>::const_iterator p = buffer.begin();
+ p != buffer.end(); ++p) {
+ SHyperedge *h = *p;
+ SVertex *v = h->head;
+ assert(v->best == h);
+ assert(v->recombined.empty());
+ std::pair<Map::iterator, bool> result = map.insert(Map::value_type(v, v));
+ if (result.second) {
+ continue; // v's recombination value hasn't been seen before.
+ }
+ // v is a duplicate (according to the recombination rules).
+ // Compare the score of h against the score of the best incoming hyperedge
+ // for the stored vertex.
+ SVertex *storedVertex = result.first->second;
+ if (h->score > storedVertex->best->score) {
+ // h's score is better.
+ storedVertex->recombined.push_back(storedVertex->best);
+ storedVertex->best = h;
+ } else {
+ storedVertex->recombined.push_back(h);
+ }
+ h->head->best = 0;
+ delete h->head;
+ h->head = storedVertex;
+ }
+
+ // Step 2: Copy the vertices from the map to the stack.
+ stack.clear();
+ stack.reserve(map.size());
+ for (Map::const_iterator p = map.begin(); p != map.end(); ++p) {
+ stack.push_back(boost::shared_ptr<SVertex>(p->first));
+ }
+
+ // Step 3: Sort the vertices in the stack.
+ std::sort(stack.begin(), stack.end(), SVertexStackContentOrderer());
+}
+
+template<typename Parser>
+void Manager<Parser>::OutputNBest(OutputCollector *collector) const
+{
+ if (collector) {
+ const StaticData &staticData = StaticData::Instance();
+ long translationId = m_source.GetTranslationId();
+
+ Syntax::KBestExtractor::KBestVec nBestList;
+ ExtractKBest(staticData.GetNBestSize(), nBestList,
+ staticData.GetDistinctNBest());
+ OutputNBestList(collector, nBestList, translationId);
+ }
+
+}
+
+
+template<typename Parser>
+void Manager<Parser>::OutputDetailedTranslationReport(OutputCollector *collector) const
+{
+ const SHyperedge *best = GetBestSHyperedge();
+ if (best == NULL || collector == NULL) {
+ return;
+ }
+
+ long translationId = m_source.GetTranslationId();
+ std::ostringstream out;
+ Syntax::S2T::DerivationWriter::Write(*best, translationId, out);
+ collector->Write(translationId, out.str());
+
+}
+
+template<typename Parser>
+void Manager<Parser>::OutputUnknowns(OutputCollector *collector) const
+{
+ if (collector) {
+ long translationId = m_source.GetTranslationId();
+
+ std::ostringstream out;
+ for (std::set<Moses::Word>::const_iterator p = m_oovs.begin();
+ p != m_oovs.end(); ++p) {
+ out << *p;
+ }
+ out << std::endl;
+ collector->Write(translationId, out.str());
+ }
+
+}
+
+template<typename Parser>
+void Manager<Parser>::OutputNBestList(OutputCollector *collector,
+ const Syntax::KBestExtractor::KBestVec &nBestList,
+ long translationId) const
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ const std::vector<Moses::FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder();
+
+ std::ostringstream out;
+
+ if (collector->OutputIsCout()) {
+ // Set precision only if we're writing the n-best list to cout. This is to
+ // preserve existing behaviour, but should probably be done either way.
+ FixPrecision(out);
+ }
+
+ bool includeWordAlignment =
+ staticData.PrintAlignmentInfoInNbest();
+
+ bool PrintNBestTrees = StaticData::Instance().PrintNBestTrees();
+
+ for (Syntax::KBestExtractor::KBestVec::const_iterator p = nBestList.begin();
+ p != nBestList.end(); ++p) {
+ const Syntax::KBestExtractor::Derivation &derivation = **p;
+
+ // get the derivation's target-side yield
+ Phrase outputPhrase = Syntax::KBestExtractor::GetOutputPhrase(derivation);
+
+ // delete <s> and </s>
+ UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+ outputPhrase.RemoveWord(0);
+ outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
+
+ // print the translation ID, surface factors, and scores
+ out << translationId << " ||| ";
+ OutputSurface(out, outputPhrase, outputFactorOrder, false);
+ out << " ||| ";
+ OutputAllFeatureScores(derivation.scoreBreakdown, out);
+ out << " ||| " << derivation.score;
+
+ // optionally, print word alignments
+ if (includeWordAlignment) {
+ out << " ||| ";
+ Alignments align;
+ OutputAlignmentNBest(align, derivation, 0);
+ for (Alignments::const_iterator q = align.begin(); q != align.end();
+ ++q) {
+ out << q->first << "-" << q->second << " ";
+ }
+ }
+
+ // optionally, print tree
+ if (PrintNBestTrees) {
+ TreePointer tree = Syntax::KBestExtractor::GetOutputTree(derivation);
+ out << " ||| " << tree->GetString();
+ }
+
+ out << std::endl;
+ }
+
+ assert(collector);
+ collector->Write(translationId, out.str());
+}
+
+template<typename Parser>
+size_t Manager<Parser>::OutputAlignmentNBest(
+ Alignments &retAlign,
+ const Syntax::KBestExtractor::Derivation &derivation,
+ size_t startTarget) const
+{
+ const Syntax::SHyperedge &shyperedge = derivation.edge->shyperedge;
+
+ size_t totalTargetSize = 0;
+ size_t startSource = shyperedge.head->pvertex->span.GetStartPos();
+
+ const TargetPhrase &tp = *(shyperedge.translation);
+
+ size_t thisSourceSize = CalcSourceSize(derivation);
+
+ // position of each terminal word in translation rule, irrespective of alignment
+ // if non-term, number is undefined
+ std::vector<size_t> sourceOffsets(thisSourceSize, 0);
+ std::vector<size_t> targetOffsets(tp.GetSize(), 0);
+
+ const AlignmentInfo &aiNonTerm = shyperedge.translation->GetAlignNonTerm();
+ std::vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
+ const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
+
+ UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
+ "Error");
+
+ size_t targetInd = 0;
+ for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
+ if (tp.GetWord(targetPos).IsNonTerminal()) {
+ UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
+ size_t sourceInd = targetPos2SourceInd[targetPos];
+ size_t sourcePos = sourceInd2pos[sourceInd];
+
+ const Moses::Syntax::KBestExtractor::Derivation &subderivation =
+ *derivation.subderivations[sourceInd];
+
+ // calc source size
+ size_t sourceSize =
+ subderivation.edge->head->svertex.pvertex->span.GetNumWordsCovered();
+ sourceOffsets[sourcePos] = sourceSize;
+
+ // calc target size.
+ // Recursively look thru child hypos
+ size_t currStartTarget = startTarget + totalTargetSize;
+ size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
+ currStartTarget);
+ targetOffsets[targetPos] = targetSize;
+
+ totalTargetSize += targetSize;
+ ++targetInd;
+ } else {
+ ++totalTargetSize;
+ }
+ }
+
+ // convert position within translation rule to absolute position within
+ // source sentence / output sentence
+ ShiftOffsets(sourceOffsets, startSource);
+ ShiftOffsets(targetOffsets, startTarget);
+
+ // get alignments from this hypo
+ const AlignmentInfo &aiTerm = shyperedge.translation->GetAlignTerm();
+
+ // add to output arg, offsetting by source & target
+ AlignmentInfo::const_iterator iter;
+ for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+ const std::pair<size_t,size_t> &align = *iter;
+ size_t relSource = align.first;
+ size_t relTarget = align.second;
+ size_t absSource = sourceOffsets[relSource];
+ size_t absTarget = targetOffsets[relTarget];
+
+ std::pair<size_t, size_t> alignPoint(absSource, absTarget);
+ std::pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+ UTIL_THROW_IF2(!ret.second, "Error");
+ }
+
+ return totalTargetSize;
+}
+
+template<typename Parser>
+size_t Manager<Parser>::CalcSourceSize(const Syntax::KBestExtractor::Derivation &d) const
+{
+ using namespace Moses::Syntax;
+
+ const Syntax::SHyperedge &shyperedge = d.edge->shyperedge;
+ size_t ret = shyperedge.head->pvertex->span.GetNumWordsCovered();
+ for (size_t i = 0; i < shyperedge.tail.size(); ++i) {
+ size_t childSize = shyperedge.tail[i]->pvertex->span.GetNumWordsCovered();
+ ret -= (childSize - 1);
+ }
+ return ret;
+}
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/Manager.h b/moses/Syntax/S2T/Manager.h
new file mode 100644
index 000000000..096e3c142
--- /dev/null
+++ b/moses/Syntax/S2T/Manager.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+
+#include "moses/InputType.h"
+#include "moses/BaseManager.h"
+#include "moses/Syntax/KBestExtractor.h"
+#include "moses/Syntax/SVertexStack.h"
+
+#include "OovHandler.h"
+#include "ParserCallback.h"
+#include "PChart.h"
+#include "SChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+class SDerivation;
+struct SHyperedge;
+
+namespace S2T
+{
+
+template<typename Parser>
+class Manager : public BaseManager
+{
+ public:
+ Manager(const InputType &);
+
+ void Decode();
+
+ // Get the SHyperedge for the 1-best derivation.
+ const SHyperedge *GetBestSHyperedge() const;
+
+ void ExtractKBest(
+ std::size_t k,
+ std::vector<boost::shared_ptr<KBestExtractor::Derivation> > &kBestList,
+ bool onlyDistinct=false) const;
+
+ const std::set<Word> &GetUnknownWords() const { return m_oovs; }
+
+ void OutputNBest(OutputCollector *collector) const;
+ void OutputLatticeSamples(OutputCollector *collector) const
+ {}
+ void OutputAlignment(OutputCollector *collector) const
+ {}
+ void OutputDetailedTranslationReport(OutputCollector *collector) const;
+ void OutputUnknowns(OutputCollector *collector) const;
+ void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const
+ {}
+ void OutputWordGraph(OutputCollector *collector) const
+ {}
+ void OutputSearchGraph(OutputCollector *collector) const
+ {}
+ void OutputSearchGraphSLF() const
+ {}
+ void OutputSearchGraphHypergraph() const
+ {}
+
+ private:
+ void FindOovs(const PChart &, std::set<Word> &, std::size_t);
+
+ void InitializeCharts();
+
+ void InitializeParsers(PChart &, std::size_t);
+
+ void RecombineAndSort(const std::vector<SHyperedge*> &, SVertexStack &);
+
+ void PrunePChart(const SChart::Cell &, PChart::Cell &);
+
+ PChart m_pchart;
+ SChart m_schart;
+ std::set<Word> m_oovs;
+ boost::shared_ptr<typename Parser::RuleTrie> m_oovRuleTrie;
+ std::vector<boost::shared_ptr<Parser> > m_parsers;
+
+ // output
+ void OutputNBestList(OutputCollector *collector,
+ const Moses::Syntax::KBestExtractor::KBestVec &nBestList,
+ long translationId) const;
+ std::size_t OutputAlignmentNBest(Alignments &retAlign,
+ const Moses::Syntax::KBestExtractor::Derivation &derivation,
+ std::size_t startTarget) const;
+ size_t CalcSourceSize(const Syntax::KBestExtractor::Derivation &d) const;
+
+};
+
+} // S2T
+} // Syntax
+} // Moses
+
+// Implementation
+#include "Manager-inl.h"
diff --git a/moses/Syntax/S2T/OovHandler-inl.h b/moses/Syntax/S2T/OovHandler-inl.h
new file mode 100644
index 000000000..e700f65c5
--- /dev/null
+++ b/moses/Syntax/S2T/OovHandler-inl.h
@@ -0,0 +1,107 @@
+#pragma once
+
+#include "moses/FF/UnknownWordPenaltyProducer.h"
+#include "moses/StaticData.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename RuleTrie>
+template<typename InputIterator>
+boost::shared_ptr<RuleTrie> OovHandler<RuleTrie>::SynthesizeRuleTrie(
+ InputIterator first, InputIterator last)
+{
+ const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
+
+ boost::shared_ptr<RuleTrie> trie(new RuleTrie(&m_ruleTableFF));
+
+ while (first != last) {
+ const Word &oov = *first++;
+ if (ShouldDrop(oov)) {
+ continue;
+ }
+ boost::scoped_ptr<Phrase> srcPhrase(SynthesizeSourcePhrase(oov));
+ for (UnknownLHSList::const_iterator p = lhsList.begin();
+ p != lhsList.end(); ++p) {
+ const std::string &targetLhsStr = p->first;
+ float prob = p->second;
+// TODO Check ownership and fix any leaks.
+ Word *tgtLHS = SynthesizeTargetLhs(targetLhsStr);
+ TargetPhrase *tp = SynthesizeTargetPhrase(oov, *srcPhrase, *tgtLHS, prob);
+ TargetPhraseCollection &tpc = GetOrCreateTargetPhraseCollection(
+ *trie, *srcPhrase, *tp, NULL); // TODO Check NULL is valid argument
+ tpc.Add(tp);
+ }
+ }
+
+ return trie;
+}
+
+template<typename RuleTrie>
+Phrase *OovHandler<RuleTrie>::SynthesizeSourcePhrase(const Word &sourceWord)
+{
+ Phrase *phrase = new Phrase(1);
+ phrase->AddWord() = sourceWord;
+ phrase->GetWord(0).SetIsOOV(true);
+ return phrase;
+}
+
+template<typename RuleTrie>
+Word *OovHandler<RuleTrie>::SynthesizeTargetLhs(const std::string &lhsStr)
+{
+ Word *targetLhs = new Word(true);
+ targetLhs->CreateFromString(Output,
+ StaticData::Instance().GetOutputFactorOrder(),
+ lhsStr, true);
+ UTIL_THROW_IF2(targetLhs->GetFactor(0) == NULL, "Null factor for target LHS");
+ return targetLhs;
+}
+
+template<typename RuleTrie>
+TargetPhrase *OovHandler<RuleTrie>::SynthesizeTargetPhrase(
+ const Word &oov, const Phrase &srcPhrase, const Word &targetLhs, float prob)
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ const UnknownWordPenaltyProducer &unknownWordPenaltyProducer =
+ UnknownWordPenaltyProducer::Instance();
+
+ TargetPhrase *targetPhrase = new TargetPhrase();
+ Word &targetWord = targetPhrase->AddWord();
+ targetWord.CreateUnknownWord(oov);
+
+ // scores
+ float score = FloorScore(TransformScore(prob));
+
+ targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, score);
+ targetPhrase->EvaluateInIsolation(srcPhrase);
+ targetPhrase->SetTargetLHS(&targetLhs);
+ targetPhrase->SetAlignmentInfo("0-0");
+ if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() ||
+ staticData.GetTreeStructure() != NULL) {
+ std::string value = "[ " + targetLhs[0]->GetString().as_string() + " " +
+ oov[0]->GetString().as_string() + " ]";
+ targetPhrase->SetProperty("Tree", value);
+ }
+
+ return targetPhrase;
+}
+
+template<typename RuleTrie>
+bool OovHandler<RuleTrie>::ShouldDrop(const Word &oov)
+{
+ if (!StaticData::Instance().GetDropUnknown()) {
+ return false;
+ }
+ const Factor *f = oov[0]; // TODO hack. shouldn't know which factor is surface
+ const StringPiece s = f->GetString();
+ return s.find_first_of("0123456789") != std::string::npos;
+}
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/OovHandler.h b/moses/Syntax/S2T/OovHandler.h
new file mode 100644
index 000000000..b74e697c5
--- /dev/null
+++ b/moses/Syntax/S2T/OovHandler.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <string>
+
+#include <boost/shared_ptr.hpp>
+
+#include "moses/Syntax/RuleTableFF.h"
+#include "moses/TargetPhrase.h"
+#include "moses/Word.h"
+
+#include "RuleTrieCreator.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename RuleTrie>
+class OovHandler : public RuleTrieCreator
+{
+ public:
+ OovHandler(const RuleTableFF &ff) : m_ruleTableFF(ff) {}
+
+ // Synthesize a RuleTrie given a sequence of OOV words. The sequence is
+ // specified by a pair of iterators (indicating the beginning and end). It
+ // is assumed not to contain duplicates.
+ template<typename InputIterator>
+ boost::shared_ptr<RuleTrie> SynthesizeRuleTrie(InputIterator, InputIterator);
+
+ private:
+ const RuleTableFF &m_ruleTableFF;
+
+ bool ShouldDrop(const Word &);
+
+ Phrase *SynthesizeSourcePhrase(const Word &);
+
+ Word *SynthesizeTargetLhs(const std::string &);
+
+ TargetPhrase *SynthesizeTargetPhrase(const Word &, const Phrase &,
+ const Word &, float);
+};
+
+} // S2T
+} // Syntax
+} // Moses
+
+#include "OovHandler-inl.h"
diff --git a/moses/Syntax/S2T/PChart.cpp b/moses/Syntax/S2T/PChart.cpp
new file mode 100644
index 000000000..de62e7a84
--- /dev/null
+++ b/moses/Syntax/S2T/PChart.cpp
@@ -0,0 +1,34 @@
+#include "PChart.h"
+
+#include "moses/FactorCollection.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+PChart::PChart(std::size_t width, bool maintainCompressedChart)
+{
+ m_cells.resize(width);
+ for (std::size_t i = 0; i < width; ++i) {
+ m_cells[i].resize(width);
+ }
+ if (maintainCompressedChart) {
+ m_compressedChart = new CompressedChart(width);
+ for (CompressedChart::iterator p = m_compressedChart->begin();
+ p != m_compressedChart->end(); ++p) {
+ p->resize(FactorCollection::Instance().GetNumNonTerminals());
+ }
+ }
+}
+
+PChart::~PChart()
+{
+ delete m_compressedChart;
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/PChart.h b/moses/Syntax/S2T/PChart.h
new file mode 100644
index 000000000..8f719eebb
--- /dev/null
+++ b/moses/Syntax/S2T/PChart.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "moses/Syntax/NonTerminalMap.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class PChart
+{
+ public:
+ struct Cell
+ {
+ typedef boost::unordered_map<Word, PVertex, SymbolHasher,
+ SymbolEqualityPred> TMap;
+ typedef NonTerminalMap<PVertex> NMap;
+ // Collection of terminal vertices (keyed by terminal symbol).
+ TMap terminalVertices;
+ // Collection of non-terminal vertices (keyed by non-terminal symbol).
+ NMap nonTerminalVertices;
+ };
+
+ struct CompressedItem {
+ std::size_t end;
+ const PVertex *vertex;
+ };
+
+ typedef std::vector<std::vector<CompressedItem> > CompressedMatrix;
+
+ PChart(std::size_t width, bool maintainCompressedChart);
+
+ ~PChart();
+
+ std::size_t GetWidth() const { return m_cells.size(); }
+
+ const Cell &GetCell(std::size_t start, std::size_t end) const {
+ return m_cells[start][end];
+ }
+
+ // Insert the given PVertex and return a reference to the inserted object.
+ PVertex &AddVertex(const PVertex &v) {
+ const std::size_t start = v.span.GetStartPos();
+ const std::size_t end = v.span.GetEndPos();
+ Cell &cell = m_cells[start][end];
+ // If v is a terminal vertex add it to the cell's terminalVertices map.
+ if (!v.symbol.IsNonTerminal()) {
+ Cell::TMap::value_type x(v.symbol, v);
+ std::pair<Cell::TMap::iterator, bool> ret =
+ cell.terminalVertices.insert(x);
+ return ret.first->second;
+ }
+ // If v is a non-terminal vertex add it to the cell's nonTerminalVertices
+ // map and update the compressed chart (if enabled).
+ std::pair<Cell::NMap::Iterator, bool> result =
+ cell.nonTerminalVertices.Insert(v.symbol, v);
+ if (result.second && m_compressedChart) {
+ CompressedItem item;
+ item.end = end;
+ item.vertex = &(result.first->second);
+ (*m_compressedChart)[start][v.symbol[0]->GetId()].push_back(item);
+ }
+ return result.first->second;
+ }
+
+ const CompressedMatrix &GetCompressedMatrix(std::size_t start) const {
+ return (*m_compressedChart)[start];
+ }
+
+ private:
+ typedef std::vector<CompressedMatrix> CompressedChart;
+
+ std::vector<std::vector<Cell> > m_cells;
+ CompressedChart *m_compressedChart;
+};
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h b/moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h
new file mode 100644
index 000000000..dd0be3ae9
--- /dev/null
+++ b/moses/Syntax/S2T/PHyperedgeToSHyperedgeBundle.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "moses/Syntax/PHyperedge.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SHyperedgeBundle.h"
+
+#include "SChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Given a PHyperedge object and SChart produces a SHyperedgeBundle object.
+inline void PHyperedgeToSHyperedgeBundle(const PHyperedge &hyperedge,
+ const SChart &schart,
+ SHyperedgeBundle &bundle) {
+ bundle.translations = hyperedge.translations;
+ bundle.stacks.clear();
+ for (std::vector<PVertex*>::const_iterator p = hyperedge.tail.begin();
+ p != hyperedge.tail.end(); ++p) {
+ const PVertex *v = *p;
+ std::size_t spanStart = v->span.GetStartPos();
+ std::size_t spanEnd = v->span.GetEndPos();
+ const Word &symbol = v->symbol;
+ const SChart::Cell &cell = schart.GetCell(spanStart, spanEnd);
+ const SVertexStack *stack = 0;
+ if (symbol.IsNonTerminal()) {
+ stack = cell.nonTerminalStacks.Find(symbol);
+ } else {
+ const SChart::Cell::TMap::const_iterator q =
+ cell.terminalStacks.find(symbol);
+ assert(q != cell.terminalStacks.end());
+ stack = &(q->second);
+ }
+ bundle.stacks.push_back(stack);
+ }
+}
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/ParserCallback.h b/moses/Syntax/S2T/ParserCallback.h
new file mode 100644
index 000000000..b18a85eae
--- /dev/null
+++ b/moses/Syntax/S2T/ParserCallback.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include "moses/Syntax/BoundedPriorityContainer.h"
+#include "moses/Syntax/PHyperedge.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/SHyperedgeBundle.h"
+#include "moses/Syntax/SHyperedgeBundleScorer.h"
+
+#include "PHyperedgeToSHyperedgeBundle.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class StandardParserCallback {
+ private:
+ typedef BoundedPriorityContainer<SHyperedgeBundle> Container;
+
+ public:
+ StandardParserCallback(const SChart &schart, std::size_t ruleLimit)
+ : m_schart(schart)
+ , m_container(ruleLimit) {}
+
+ void operator()(const PHyperedge &hyperedge) {
+ PHyperedgeToSHyperedgeBundle(hyperedge, m_schart, m_tmpBundle);
+ float score = SHyperedgeBundleScorer::Score(m_tmpBundle);
+ m_container.SwapIn(m_tmpBundle, score);
+ }
+
+ void InitForRange(const WordsRange &range) { m_container.LazyClear(); }
+
+ const Container &GetContainer() { return m_container; }
+
+ private:
+ const SChart &m_schart;
+ SHyperedgeBundle m_tmpBundle;
+ BoundedPriorityContainer<SHyperedgeBundle> m_container;
+};
+
+class EagerParserCallback {
+ private:
+ typedef BoundedPriorityContainer<SHyperedgeBundle> Container;
+
+ public:
+ EagerParserCallback(const SChart &schart, std::size_t ruleLimit)
+ : m_schart(schart)
+ , m_containers(schart.GetWidth(), Container(ruleLimit))
+ , m_prevStart(std::numeric_limits<std::size_t>::max()) {}
+
+ void operator()(const PHyperedge &hyperedge, std::size_t end) {
+ PHyperedgeToSHyperedgeBundle(hyperedge, m_schart, m_tmpBundle);
+ float score = SHyperedgeBundleScorer::Score(m_tmpBundle);
+ m_containers[end].SwapIn(m_tmpBundle, score);
+ }
+
+ void InitForRange(const WordsRange &range) {
+ const std::size_t start = range.GetStartPos();
+ m_end = range.GetEndPos();
+ if (start != m_prevStart) {
+ for (std::vector<Container>::iterator p = m_containers.begin();
+ p != m_containers.end(); ++p) {
+ p->LazyClear();
+ }
+ m_prevStart = start;
+ }
+ }
+
+ const Container &GetContainer() { return m_containers[m_end]; }
+
+ private:
+ const SChart &m_schart;
+ SHyperedgeBundle m_tmpBundle;
+ std::vector<Container> m_containers;
+ std::size_t m_end;
+ std::size_t m_prevStart;
+};
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/Parsers/Parser.h b/moses/Syntax/S2T/Parsers/Parser.h
new file mode 100644
index 000000000..b13a8d502
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Parser.h
@@ -0,0 +1,30 @@
+#pragma once
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class PChart;
+
+// Base class for parsers.
+template<typename Callback>
+class Parser
+{
+ public:
+ typedef Callback CallbackType;
+
+ Parser(PChart &chart) : m_chart(chart) {}
+
+ virtual ~Parser() {}
+
+ virtual void EnumerateHyperedges(const WordsRange &, Callback &) = 0;
+ protected:
+ PChart &m_chart;
+};
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h b/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h
new file mode 100644
index 000000000..b275a93ee
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser-inl.h
@@ -0,0 +1,164 @@
+#pragma once
+
+#include "moses/Syntax/S2T/PChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename Callback>
+RecursiveCYKPlusParser<Callback>::RecursiveCYKPlusParser(
+ PChart &chart,
+ const RuleTrie &trie,
+ std::size_t maxChartSpan)
+ : Parser<Callback>(chart)
+ , m_ruleTable(trie)
+ , m_maxChartSpan(maxChartSpan)
+ , m_callback(NULL)
+{
+ m_hyperedge.head = 0;
+}
+
+template<typename Callback>
+void RecursiveCYKPlusParser<Callback>::EnumerateHyperedges(
+ const WordsRange &range,
+ Callback &callback)
+{
+ const std::size_t start = range.GetStartPos();
+ const std::size_t end = range.GetEndPos();
+ m_callback = &callback;
+ const RuleTrie::Node &rootNode = m_ruleTable.GetRootNode();
+ m_maxEnd = std::min(Base::m_chart.GetWidth()-1, start+m_maxChartSpan-1);
+ m_hyperedge.tail.clear();
+
+ // Find all hyperedges where the first incoming vertex is a terminal covering
+ // [start,end].
+ GetTerminalExtension(rootNode, start, end);
+
+ // Find all hyperedges where the first incoming vertex is a non-terminal
+ // covering [start,end-1].
+ if (end > start) {
+ GetNonTerminalExtensions(rootNode, start, end-1, end-1);
+ }
+}
+
+// Search for all extensions of a partial rule (pointed at by node) that begin
+// with a non-terminal over a span between [start,minEnd] and [start,maxEnd].
+template<typename Callback>
+void RecursiveCYKPlusParser<Callback>::GetNonTerminalExtensions(
+ const RuleTrie::Node &node,
+ std::size_t start,
+ std::size_t minEnd,
+ std::size_t maxEnd) {
+ // Non-terminal labels in node's outgoing edge set.
+ const RuleTrie::Node::SymbolMap &nonTermMap = node.GetNonTerminalMap();
+
+ // Compressed matrix from PChart.
+ const PChart::CompressedMatrix &matrix =
+ Base::m_chart.GetCompressedMatrix(start);
+
+ // Loop over possible expansions of the rule.
+ RuleTrie::Node::SymbolMap::const_iterator p;
+ RuleTrie::Node::SymbolMap::const_iterator p_end = nonTermMap.end();
+ for (p = nonTermMap.begin(); p != p_end; ++p) {
+ const Word &nonTerm = p->first;
+ const std::vector<PChart::CompressedItem> &items =
+ matrix[nonTerm[0]->GetId()];
+ for (std::vector<PChart::CompressedItem>::const_iterator q = items.begin();
+ q != items.end(); ++q) {
+ if (q->end >= minEnd && q->end <= maxEnd) {
+ const RuleTrie::Node &child = p->second;
+ AddAndExtend(child, q->end, *(q->vertex));
+ }
+ }
+ }
+}
+
+// Search for all extensions of a partial rule (pointed at by node) that begin
+// with a terminal over span [start,end].
+template<typename Callback>
+void RecursiveCYKPlusParser<Callback>::GetTerminalExtension(
+ const RuleTrie::Node &node,
+ std::size_t start,
+ std::size_t end) {
+
+ const PChart::Cell::TMap &vertexMap =
+ Base::m_chart.GetCell(start, end).terminalVertices;
+ if (vertexMap.empty()) {
+ return;
+ }
+
+ const RuleTrie::Node::SymbolMap &terminals = node.GetTerminalMap();
+
+ for (PChart::Cell::TMap::const_iterator p = vertexMap.begin();
+ p != vertexMap.end(); ++p) {
+ const Word &terminal = p->first;
+ const PVertex &vertex = p->second;
+
+ // if node has small number of terminal edges, test word equality for each.
+ if (terminals.size() < 5) {
+ for (RuleTrie::Node::SymbolMap::const_iterator iter = terminals.begin();
+ iter != terminals.end(); ++iter) {
+ const Word &word = iter->first;
+ if (word == terminal) {
+ const RuleTrie::Node *child = & iter->second;
+ AddAndExtend(*child, end, vertex);
+ break;
+ }
+ }
+ } else { // else, do hash lookup
+ const RuleTrie::Node *child = node.GetChild(terminal);
+ if (child != NULL) {
+ AddAndExtend(*child, end, vertex);
+ }
+ }
+ }
+}
+
+// If a (partial) rule matches, pass it to the callback (if non-unary and
+// non-empty), and try to find expansions that have this partial rule as prefix.
+template<typename Callback>
+void RecursiveCYKPlusParser<Callback>::AddAndExtend(
+ const RuleTrie::Node &node,
+ std::size_t end,
+ const PVertex &vertex) {
+ // FIXME Sort out const-ness.
+ m_hyperedge.tail.push_back(const_cast<PVertex *>(&vertex));
+
+ // Add target phrase collection (except if rule is empty or unary).
+ const TargetPhraseCollection &tpc = node.GetTargetPhraseCollection();
+ if (!tpc.IsEmpty() && !IsNonLexicalUnary(m_hyperedge)) {
+ m_hyperedge.translations = &tpc;
+ (*m_callback)(m_hyperedge, end);
+ }
+
+ // Get all further extensions of rule (until reaching end of sentence or
+ // max-chart-span).
+ if (end < m_maxEnd) {
+ if (!node.GetTerminalMap().empty()) {
+ for (std::size_t newEndPos = end+1; newEndPos <= m_maxEnd; newEndPos++) {
+ GetTerminalExtension(node, end+1, newEndPos);
+ }
+ }
+ if (!node.GetNonTerminalMap().empty()) {
+ GetNonTerminalExtensions(node, end+1, end+1, m_maxEnd);
+ }
+ }
+
+ m_hyperedge.tail.pop_back();
+}
+
+template<typename Callback>
+bool RecursiveCYKPlusParser<Callback>::IsNonLexicalUnary(
+ const PHyperedge &hyperedge) const
+{
+ return hyperedge.tail.size() == 1 &&
+ hyperedge.tail[0]->symbol.IsNonTerminal();
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h b/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h
new file mode 100644
index 000000000..264d43eea
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "moses/Syntax/PHyperedge.h"
+#include "moses/Syntax/PVertex.h"
+#include "moses/Syntax/S2T/Parsers/Parser.h"
+#include "moses/Syntax/S2T/RuleTrieCYKPlus.h"
+#include "moses/WordsRange.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Parser that implements the recursive variant of CYK+ from this paper:
+//
+// Rico Sennrich
+// "A CYK+ Variant for SCFG Decoding Without a Dot Chart"
+// In proceedings of SSST-8 2014
+//
+template<typename Callback>
+class RecursiveCYKPlusParser : public Parser<Callback>
+{
+ public:
+ typedef Parser<Callback> Base;
+ typedef RuleTrieCYKPlus RuleTrie;
+
+ // TODO Make this configurable?
+ static bool RequiresCompressedChart() { return true; }
+
+ RecursiveCYKPlusParser(PChart &, const RuleTrie &, std::size_t);
+
+ ~RecursiveCYKPlusParser() {}
+
+ void EnumerateHyperedges(const WordsRange &, Callback &);
+
+ private:
+
+ void GetTerminalExtension(const RuleTrie::Node &, std::size_t, std::size_t);
+
+ void GetNonTerminalExtensions(const RuleTrie::Node &, std::size_t,
+ std::size_t, std::size_t);
+
+ void AddAndExtend(const RuleTrie::Node &, std::size_t, const PVertex &);
+
+ bool IsNonLexicalUnary(const PHyperedge &) const;
+
+ const RuleTrie &m_ruleTable;
+ const std::size_t m_maxChartSpan;
+ std::size_t m_maxEnd;
+ PHyperedge m_hyperedge;
+ Callback *m_callback;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
+
+// Implementation
+#include "RecursiveCYKPlusParser-inl.h"
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h b/moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h
new file mode 100644
index 000000000..d55f7e842
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/Parser-inl.h
@@ -0,0 +1,185 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "moses/ChartParser.h"
+#include "moses/ChartTranslationOptionList.h"
+#include "moses/InputType.h"
+#include "moses/NonTerminal.h"
+#include "moses/StaticData.h"
+#include "moses/Syntax/S2T/Parsers/Parser.h"
+#include "moses/Syntax/S2T/PChart.h"
+
+#include "TailLatticeSearcher.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename Callback>
+Scope3Parser<Callback>::Scope3Parser(PChart &chart, const RuleTrie &trie,
+ std::size_t maxChartSpan)
+ : Parser<Callback>(chart)
+ , m_ruleTable(trie)
+ , m_maxChartSpan(maxChartSpan)
+ , m_latticeBuilder(chart)
+{
+ Init();
+}
+
+template<typename Callback>
+Scope3Parser<Callback>::~Scope3Parser()
+{
+ delete m_patRoot;
+}
+
+template<typename Callback>
+void Scope3Parser<Callback>::EnumerateHyperedges(const WordsRange &range,
+ Callback &callback)
+{
+ const std::size_t start = range.GetStartPos();
+ const std::size_t end = range.GetEndPos();
+
+ const std::vector<const PatternApplicationTrie *> &patNodes =
+ m_patSpans[start][end-start+1];
+
+ for (std::vector<const PatternApplicationTrie *>::const_iterator
+ p = patNodes.begin(); p != patNodes.end(); ++p) {
+ const PatternApplicationTrie *patNode = *p;
+
+ // Read off the sequence of PAT nodes ending at patNode.
+ patNode->ReadOffPatternApplicationKey(m_patKey);
+
+ // Calculate the start and end ranges for each symbol in the PAT key.
+ m_symbolRangeCalculator.Calc(m_patKey, start, end, m_symbolRanges);
+
+ // Build a lattice that encodes the set of PHyperedge tails that can be
+ // generated from this pattern + span.
+ m_latticeBuilder.Build(m_patKey, m_symbolRanges, m_lattice,
+ m_quickCheckTable);
+
+ // Ask the grammar for the mapping from label sequences to target phrase
+ // collections for this pattern.
+ const RuleTrie::Node::LabelMap &labelMap =
+ patNode->m_node->GetLabelMap();
+
+ // For each label sequence, search the lattice for the set of PHyperedge
+ // tails.
+ TailLatticeSearcher<Callback> searcher(m_lattice, m_patKey, m_symbolRanges);
+ RuleTrie::Node::LabelMap::const_iterator q = labelMap.begin();
+ for (; q != labelMap.end(); ++q) {
+ const std::vector<int> &labelSeq = q->first;
+ const TargetPhraseCollection &tpc = q->second;
+ // For many label sequences there won't be any corresponding paths through
+ // the lattice. As an optimisation, we use m_quickCheckTable to test
+ // for this and we don't begin a search if there are no paths to find.
+ bool failCheck = false;
+ std::size_t nonTermIndex = 0;
+ for (std::size_t i = 0; i < m_patKey.size(); ++i) {
+ if (m_patKey[i]->IsTerminalNode()) {
+ continue;
+ }
+ if (!m_quickCheckTable[nonTermIndex][labelSeq[nonTermIndex]]) {
+ failCheck = true;
+ break;
+ }
+ ++nonTermIndex;
+ }
+ if (failCheck) {
+ continue;
+ }
+ searcher.Search(labelSeq, tpc, callback);
+ }
+ }
+}
+
+template<typename Callback>
+void Scope3Parser<Callback>::Init()
+{
+ // Build a map from Words to PVertex sets.
+ SentenceMap sentMap;
+ FillSentenceMap(sentMap);
+
+ // Build the pattern application trie (PAT) for this input sentence.
+ const RuleTrie::Node &root = m_ruleTable.GetRootNode();
+ m_patRoot = new PatternApplicationTrie(-1, -1, root, 0, 0);
+ m_patRoot->Extend(root, -1, sentMap, false);
+
+ // Generate per-span lists of PAT node pointers.
+ InitRuleApplicationVector();
+ RecordPatternApplicationSpans(*m_patRoot);
+}
+
+/* TODO Rename */
+template<typename Callback>
+void Scope3Parser<Callback>::InitRuleApplicationVector()
+{
+ std::size_t length = Base::m_chart.GetWidth();
+ m_patSpans.resize(length);
+ for (std::size_t start = 0; start < length; ++start) {
+ std::size_t maxSpan = length-start;
+ m_patSpans[start].resize(maxSpan+1);
+ }
+}
+
+template<typename Callback>
+void Scope3Parser<Callback>::FillSentenceMap(SentenceMap &sentMap)
+{
+ typedef PChart::Cell Cell;
+
+ const std::size_t width = Base::m_chart.GetWidth();
+ for (std::size_t i = 0; i < width; ++i) {
+ for (std::size_t j = i; j < width; ++j) {
+ const Cell::TMap &map = Base::m_chart.GetCell(i, j).terminalVertices;
+ for (Cell::TMap::const_iterator p = map.begin(); p != map.end(); ++p) {
+ const Word &terminal = p->first;
+ const PVertex &v = p->second;
+ sentMap[terminal].push_back(&v);
+ }
+ }
+ }
+}
+
+template<typename Callback>
+void Scope3Parser<Callback>::RecordPatternApplicationSpans(
+ const PatternApplicationTrie &patNode)
+{
+ if (patNode.m_node->HasRules()) {
+ int s1 = -1;
+ int s2 = -1;
+ int e1 = -1;
+ int e2 = -1;
+ patNode.DetermineStartRange(Base::m_chart.GetWidth(), s1, s2);
+ patNode.DetermineEndRange(Base::m_chart.GetWidth(), e1, e2);
+
+ int minSpan = patNode.Depth();
+
+ // Add a PAT node pointer for each valid span in the range.
+ for (int i = s1; i <= s2; ++i) {
+ for (int j = std::max(e1, i+minSpan-1); j <= e2; ++j) {
+ std::size_t span = j-i+1;
+ assert(span >= 1);
+        if (static_cast<int>(span) < minSpan) {
+ continue;
+ }
+ if (m_maxChartSpan && span > m_maxChartSpan) {
+ break;
+ }
+ m_patSpans[i][span].push_back(&patNode);
+ }
+ }
+ }
+
+ for (std::vector<PatternApplicationTrie*>::const_iterator p =
+ patNode.m_children.begin(); p != patNode.m_children.end(); ++p) {
+ RecordPatternApplicationSpans(**p);
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h b/moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h
new file mode 100644
index 000000000..d3104d9b1
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "moses/Syntax/S2T/Parsers/Parser.h"
+#include "moses/Syntax/S2T/RuleTrieScope3.h"
+#include "moses/WordsRange.h"
+
+#include "PatternApplicationTrie.h"
+#include "SymbolRangeCalculator.h"
+#include "TailLattice.h"
+#include "TailLatticeBuilder.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Parser that implements the algorithm described in this paper:
+//
+// Philip Williams and Philipp Koehn
+// "GHKM Rule Extraction and Scope-3 Parsing in Moses"
+// In proceedings of WMT 2012
+//
+template<typename Callback>
+class Scope3Parser : public Parser<Callback>
+{
+public:
+ typedef Parser<Callback> Base;
+ typedef RuleTrieScope3 RuleTrie;
+
+ // TODO Make this configurable?
+ static bool RequiresCompressedChart() { return false; }
+
+ Scope3Parser(PChart &, const RuleTrie &, std::size_t);
+
+ ~Scope3Parser();
+
+ void EnumerateHyperedges(const WordsRange &, Callback &);
+
+private:
+ void Init();
+ void InitRuleApplicationVector();
+ void FillSentenceMap(SentenceMap &);
+ void RecordPatternApplicationSpans(const PatternApplicationTrie &);
+
+ PatternApplicationTrie *m_patRoot;
+ std::vector<std::vector<bool> > m_quickCheckTable;
+ const RuleTrie &m_ruleTable;
+ const std::size_t m_maxChartSpan;
+ TailLattice m_lattice;
+ TailLatticeBuilder m_latticeBuilder;
+ SymbolRangeCalculator m_symbolRangeCalculator;
+ std::vector<SymbolRange> m_symbolRanges;
+ PatternApplicationKey m_patKey;
+
+ /* m_patSpans[i][j] records the set of all PAT nodes for span [i,i+j]
+ i.e. j is the width of the span */
+ std::vector<std::vector<
+ std::vector<const PatternApplicationTrie *> > > m_patSpans;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
+
+// Implementation
+#include "Parser-inl.h"
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp b/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp
new file mode 100644
index 000000000..218cd4017
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.cpp
@@ -0,0 +1,190 @@
+#include "PatternApplicationTrie.h"
+
+#include "moses/Syntax/PVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+int PatternApplicationTrie::Depth() const {
+ if (m_parent) {
+ return m_parent->Depth() + 1;
+ }
+ return 0;
+}
+
+const PatternApplicationTrie *
+PatternApplicationTrie::GetHighestTerminalNode() const
+{
+ // Check if result has been cached.
+ if (m_highestTerminalNode) {
+ return m_highestTerminalNode;
+ }
+ // It doesn't really make sense to call this on the root node. Just return 0.
+ if (!m_parent) {
+ return 0;
+ }
+ // Is this the highest non-root node?
+ if (!m_parent->m_parent) {
+ if (IsTerminalNode()) {
+ m_highestTerminalNode = this;
+ return this;
+ } else {
+ return 0;
+ }
+ }
+ // This is not the highest non-root node, so ask parent node.
+ if (const PatternApplicationTrie *p = m_parent->GetHighestTerminalNode()) {
+ m_highestTerminalNode = p;
+ return p;
+ }
+ // There are no terminal nodes higher than this node.
+ if (IsTerminalNode()) {
+ m_highestTerminalNode = this;
+ }
+ return m_highestTerminalNode;
+}
+
+const PatternApplicationTrie *
+PatternApplicationTrie::GetLowestTerminalNode() const
+{
+ // Check if result has been cached.
+ if (m_lowestTerminalNode) {
+ return m_lowestTerminalNode;
+ }
+ // It doesn't really make sense to call this on the root node. Just return 0.
+ if (!m_parent) {
+ return 0;
+ }
+ // Is this a terminal node?
+ if (IsTerminalNode()) {
+ m_lowestTerminalNode = this;
+ return this;
+ }
+ // Is this the highest non-root node?
+ if (!m_parent->m_parent) {
+ return 0;
+ }
+ // Ask parent node.
+ return m_parent->GetLowestTerminalNode();
+}
+
+// A node corresponds to a rule pattern that has been partially applied to a
+// sentence (the terminals have fixed positions, but the spans of gap symbols
+// may be unknown). This function determines the range of possible start
+// values for the partially-applied pattern.
+void PatternApplicationTrie::DetermineStartRange(int sentenceLength,
+ int &minStart,
+ int &maxStart) const
+{
+ // Find the leftmost terminal symbol, if any.
+ const PatternApplicationTrie *n = GetHighestTerminalNode();
+ if (!n) {
+ // The pattern contains only gap symbols.
+ minStart = 0;
+ maxStart = sentenceLength-Depth();
+ return;
+ }
+ assert(n->m_parent);
+ if (!n->m_parent->m_parent) {
+ // The pattern begins with a terminal symbol so the start position is
+ // fixed.
+ minStart = n->m_start;
+ maxStart = n->m_start;
+ } else {
+ // The pattern begins with a gap symbol but it contains at least one
+ // terminal symbol. The maximum start position is the start position of
+ // the leftmost terminal minus one position for each leading gap symbol.
+ minStart = 0;
+ maxStart = n->m_start - (n->Depth()-1);
+ }
+}
+
+// A node corresponds to a rule pattern that has been partially applied to a
+// sentence (the terminals have fixed positions, but the spans of gap symbols
+// may be unknown). This function determines the range of possible end values
+// for the partially-applied pattern.
+void PatternApplicationTrie::DetermineEndRange(int sentenceLength,
+ int &minEnd,
+ int &maxEnd) const
+{
+ // Find the rightmost terminal symbol, if any.
+ const PatternApplicationTrie *n = GetLowestTerminalNode();
+ if (!n) {
+ // The pattern contains only gap symbols.
+ minEnd = Depth()-1;
+ maxEnd = sentenceLength-1;
+ return;
+ }
+ if (n == this) {
+ // The pattern ends with a terminal symbol so the end position is fixed.
+ minEnd = m_end;
+ maxEnd = m_end;
+ } else {
+ // The pattern ends with a gap symbol but it contains at least one terminal
+ // symbol. The minimum end position is the end position of the rightmost
+ // terminal + one position for each trailing gap symbol.
+ minEnd = n->m_end + (Depth()-n->Depth());
+ maxEnd = sentenceLength-1;
+ }
+}
+
+void PatternApplicationTrie::Extend(const RuleTrieScope3::Node &node,
+ int minPos, const SentenceMap &sentMap,
+ bool followsGap)
+{
+ const RuleTrieScope3::Node::TerminalMap &termMap = node.GetTerminalMap();
+ for (RuleTrieScope3::Node::TerminalMap::const_iterator p = termMap.begin();
+ p != termMap.end(); ++p) {
+ const Word &word = p->first;
+ const RuleTrieScope3::Node &child = p->second;
+ SentenceMap::const_iterator q = sentMap.find(word);
+ if (q == sentMap.end()) {
+ continue;
+ }
+ for (std::vector<const PVertex *>::const_iterator r = q->second.begin();
+ r != q->second.end(); ++r) {
+ const PVertex *v = *r;
+ std::size_t start = v->span.GetStartPos();
+ std::size_t end = v->span.GetEndPos();
+ if (start == (std::size_t)minPos ||
+ (followsGap && start > (std::size_t)minPos) ||
+ minPos == -1) {
+ PatternApplicationTrie *subTrie =
+ new PatternApplicationTrie(start, end, child, v, this);
+ subTrie->Extend(child, end+1, sentMap, false);
+ m_children.push_back(subTrie);
+ }
+ }
+ }
+
+ const RuleTrieScope3::Node *child = node.GetNonTerminalChild();
+ if (!child) {
+ return;
+ }
+ int start = followsGap ? -1 : minPos;
+ PatternApplicationTrie *subTrie =
+ new PatternApplicationTrie(start, -1, *child, 0, this);
+ int newMinPos = (minPos == -1 ? 1 : minPos+1);
+ subTrie->Extend(*child, newMinPos, sentMap, true);
+ m_children.push_back(subTrie);
+}
+
+void PatternApplicationTrie::ReadOffPatternApplicationKey(
+ PatternApplicationKey &key) const {
+ const int depth = Depth();
+ key.resize(depth);
+ const PatternApplicationTrie *p = this;
+ std::size_t i = depth-1;
+ while (p->m_parent != 0) {
+ key[i--] = p;
+ p = p->m_parent;
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h b/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h
new file mode 100644
index 000000000..0ad371367
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/PatternApplicationTrie.h
@@ -0,0 +1,65 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/Syntax/S2T/RuleTrieScope3.h"
+#include "moses/Util.h"
+
+#include "SentenceMap.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+struct PatternApplicationTrie;
+
+typedef std::vector<const PatternApplicationTrie*> PatternApplicationKey;
+
+struct PatternApplicationTrie {
+ public:
+ PatternApplicationTrie(int start, int end, const RuleTrieScope3::Node &node,
+ const PVertex *pvertex, PatternApplicationTrie *parent)
+ : m_start(start)
+ , m_end(end)
+ , m_node(&node)
+ , m_pvertex(pvertex)
+ , m_parent(parent)
+ , m_highestTerminalNode(0)
+ , m_lowestTerminalNode(0) {}
+
+ ~PatternApplicationTrie() {
+ RemoveAllInColl(m_children);
+ }
+
+ int Depth() const;
+
+ bool IsGapNode() const { return m_end == -1; }
+ bool IsTerminalNode() const { return m_end != -1; }
+
+ const PatternApplicationTrie *GetHighestTerminalNode() const;
+ const PatternApplicationTrie *GetLowestTerminalNode() const;
+
+ void DetermineStartRange(int, int &, int &) const;
+ void DetermineEndRange(int, int &, int &) const;
+
+ void Extend(const RuleTrieScope3::Node &node, int minPos,
+ const SentenceMap &sentMap, bool followsGap);
+
+ void ReadOffPatternApplicationKey(PatternApplicationKey &) const;
+
+ int m_start;
+ int m_end;
+ const RuleTrieScope3::Node *m_node;
+ const PVertex *m_pvertex;
+ PatternApplicationTrie *m_parent;
+ std::vector<PatternApplicationTrie*> m_children;
+ mutable const PatternApplicationTrie *m_highestTerminalNode;
+ mutable const PatternApplicationTrie *m_lowestTerminalNode;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h b/moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h
new file mode 100644
index 000000000..8e6aae9f1
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/SentenceMap.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex;
+
+namespace S2T
+{
+
+// FIXME Check SymbolHasher does the right thing here
+typedef boost::unordered_map<Word, std::vector<const PVertex *>, SymbolHasher,
+ SymbolEqualityPred> SentenceMap;
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h
new file mode 100644
index 000000000..ccb0d6521
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRange.h
@@ -0,0 +1,21 @@
+#pragma once
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Describes the range of possible start and end positions for a symbol
+// belonging to a node in a PatternApplicationTrie.
+struct SymbolRange {
+ int minStart;
+ int maxStart;
+ int minEnd;
+ int maxEnd;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp
new file mode 100644
index 000000000..0eb615db8
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.cpp
@@ -0,0 +1,160 @@
+#include "SymbolRangeCalculator.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+void SymbolRangeCalculator::Calc(const PatternApplicationKey &key,
+ int spanStart, int spanEnd,
+ std::vector<SymbolRange> &ranges)
+{
+ FillInTerminalRanges(key, ranges);
+ FillInAuxSymbolInfo(ranges);
+ FillInGapRanges(key, spanStart, spanEnd, ranges);
+}
+
+// Fill in ranges for terminals and set ranges to -1 for non-terminals.
+void SymbolRangeCalculator::FillInTerminalRanges(
+ const PatternApplicationKey &key, std::vector<SymbolRange> &ranges)
+{
+ ranges.resize(key.size());
+ for (std::size_t i = 0; i < key.size(); ++i) {
+ const PatternApplicationTrie *patNode = key[i];
+ if (patNode->IsTerminalNode()) {
+ ranges[i].minStart = ranges[i].maxStart = patNode->m_start;
+ ranges[i].minEnd = ranges[i].maxEnd = patNode->m_end;
+ } else {
+ ranges[i].minStart = ranges[i].maxStart = -1;
+ ranges[i].minEnd = ranges[i].maxEnd = -1;
+ }
+ }
+}
+
+void SymbolRangeCalculator::FillInAuxSymbolInfo(
+ const std::vector<SymbolRange> &ranges)
+{
+ m_auxSymbolInfo.resize(ranges.size());
+
+ // Forward pass: set distanceToPrevTerminal.
+ int distanceToPrevTerminal = -1;
+ for (std::size_t i = 0; i < ranges.size(); ++i) {
+ const SymbolRange &range = ranges[i];
+ AuxSymbolInfo &auxInfo = m_auxSymbolInfo[i];
+ if (range.minStart != -1) {
+ // Symbol i is a terminal.
+ assert(range.maxStart == range.minStart);
+ distanceToPrevTerminal = 1;
+ // Distances are not used for terminals so set auxInfo value to -1.
+ auxInfo.distanceToPrevTerminal = -1;
+ } else if (distanceToPrevTerminal == -1) {
+ // Symbol i is a non-terminal and there are no preceding terminals.
+ auxInfo.distanceToPrevTerminal = -1;
+ } else {
+ // Symbol i is a non-terminal and there is a preceding terminal.
+ auxInfo.distanceToPrevTerminal = distanceToPrevTerminal++;
+ }
+ }
+
+ // Backward pass: set distanceToNextTerminal
+ int distanceToNextTerminal = -1;
+ for (std::size_t j = ranges.size(); j > 0; --j) {
+ std::size_t i = j-1;
+ const SymbolRange &range = ranges[i];
+ AuxSymbolInfo &auxInfo = m_auxSymbolInfo[i];
+ if (range.minStart != -1) {
+ // Symbol i is a terminal.
+ assert(range.maxStart == range.minStart);
+ distanceToNextTerminal = 1;
+ // Distances are not used for terminals so set auxInfo value to -1.
+ auxInfo.distanceToNextTerminal = -1;
+ } else if (distanceToNextTerminal == -1) {
+ // Symbol i is a non-terminal and there are no succeeding terminals.
+ auxInfo.distanceToNextTerminal = -1;
+ } else {
+ // Symbol i is a non-terminal and there is a succeeding terminal.
+ auxInfo.distanceToNextTerminal = distanceToNextTerminal++;
+ }
+ }
+}
+
+void SymbolRangeCalculator::FillInGapRanges(const PatternApplicationKey &key,
+ int spanStart, int spanEnd,
+ std::vector<SymbolRange> &ranges)
+{
+ for (std::size_t i = 0; i < key.size(); ++i) {
+ const PatternApplicationTrie *patNode = key[i];
+
+ if (patNode->IsTerminalNode()) {
+ continue;
+ }
+
+ SymbolRange &range = ranges[i];
+ AuxSymbolInfo &auxInfo = m_auxSymbolInfo[i];
+
+ // Determine minimum start position.
+ if (auxInfo.distanceToPrevTerminal == -1) {
+ // There are no preceding terminals in pattern.
+ range.minStart = spanStart + i;
+ } else {
+ // There is at least one preceding terminal in the pattern.
+ int j = i - auxInfo.distanceToPrevTerminal;
+ assert(ranges[j].minEnd == ranges[j].maxEnd);
+ range.minStart = ranges[j].maxEnd + auxInfo.distanceToPrevTerminal;
+ }
+
+ // Determine maximum start position.
+ if (i == 0) {
+ // Gap is leftmost symbol in pattern.
+ range.maxStart = spanStart;
+ } else if (auxInfo.distanceToPrevTerminal == 1) {
+ // Gap follows terminal so start position is fixed.
+ range.maxStart = ranges[i-1].maxEnd + 1;
+ } else if (auxInfo.distanceToNextTerminal == -1) {
+ // There are no succeeding terminals in the pattern.
+ int numFollowingGaps = (ranges.size()-1) - i;
+ range.maxStart = spanEnd - numFollowingGaps;
+ } else {
+ // There is at least one succeeding terminal in the pattern.
+ int j = i + auxInfo.distanceToNextTerminal;
+ range.maxStart = ranges[j].minStart - auxInfo.distanceToNextTerminal;
+ }
+
+ // Determine minimum end position.
+ if (i+1 == key.size()) {
+ // Gap is rightmost symbol in pattern.
+ range.minEnd = spanEnd;
+ } else if (auxInfo.distanceToNextTerminal == 1) {
+ // Gap immediately precedes terminal.
+ range.minEnd = ranges[i+1].minStart - 1;
+ } else if (auxInfo.distanceToPrevTerminal == -1) {
+ // There are no preceding terminals in pattern.
+ range.minEnd = spanStart + i;
+ } else {
+ // There is at least one preceding terminal in the pattern.
+ int j = i - auxInfo.distanceToPrevTerminal;
+ assert(ranges[j].minEnd == ranges[j].maxEnd);
+ range.minEnd = ranges[j].maxEnd + auxInfo.distanceToPrevTerminal;
+ }
+
+ // Determine maximum end position.
+ if (i+1 == key.size()) {
+ // Gap is rightmost symbol in pattern.
+ range.maxEnd = spanEnd;
+ } else if (auxInfo.distanceToNextTerminal == -1) {
+ // There are no succeeding terminals in the pattern.
+ int numFollowingGaps = (ranges.size()-1) - i;
+ range.maxEnd = spanEnd - numFollowingGaps;
+ } else {
+ // There is at least one succeeding terminal in the pattern.
+ int j = i + auxInfo.distanceToNextTerminal;
+ range.maxEnd = ranges[j].minStart - auxInfo.distanceToNextTerminal;
+ }
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h
new file mode 100644
index 000000000..341fb9bb4
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/SymbolRangeCalculator.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <vector>
+
+#include "PatternApplicationTrie.h"
+#include "SymbolRange.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class SymbolRangeCalculator
+{
+ public:
+ void Calc(const PatternApplicationKey &, int, int,
+ std::vector<SymbolRange> &);
+
+ private:
+ // Provides contextual information used in determining a symbol's range.
+ struct AuxSymbolInfo {
+ int distanceToNextTerminal;
+ int distanceToPrevTerminal;
+ };
+
+ void FillInTerminalRanges(const PatternApplicationKey &,
+ std::vector<SymbolRange> &);
+
+ void FillInAuxSymbolInfo(const std::vector<SymbolRange> &);
+
+ void FillInGapRanges(const PatternApplicationKey &, int, int,
+ std::vector<SymbolRange> &);
+
+ std::vector<AuxSymbolInfo> m_auxSymbolInfo;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h
new file mode 100644
index 000000000..9ee16b186
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLattice.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <utility>
+#include <vector>
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+/* Lattice in which a full path corresponds to the tail of a PHyperedge.
+ * For an entry x[i][j][k][l] in a TailLattice x:
+ *
+ * i = offset from start of rule pattern
+ *
+ * j = index of gap + 1 (zero indicates a terminal, otherwise the index is
+ * zero-based from the left of the rule pattern)
+ *
+ * k = arc width
+ *
+ * l = label index (zero for terminals, otherwise as in RuleTrieScope3::Node)
+ */
+typedef std::vector<
+ std::vector<
+ std::vector<
+ std::vector<const PVertex *> > > > TailLattice;
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp
new file mode 100644
index 000000000..6b31090fc
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.cpp
@@ -0,0 +1,131 @@
+#include "TailLatticeBuilder.h"
+
+#include "moses/Syntax/S2T/RuleTrieScope3.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+void TailLatticeBuilder::Build(
+ const std::vector<const PatternApplicationTrie *> &key,
+ const std::vector<SymbolRange> &ranges,
+ TailLattice &lattice,
+ std::vector<std::vector<bool> > &checkTable)
+{
+ assert(key.size() == ranges.size());
+ assert(key.size() > 0);
+
+ ExtendAndClear(key, ranges, lattice, checkTable);
+
+ const int spanStart = ranges.front().minStart;
+
+ const RuleTrieScope3::Node *utrieNode = key.back()->m_node;
+
+ const RuleTrieScope3::Node::LabelTable &labelTable =
+ utrieNode->GetLabelTable();
+
+ std::size_t nonTermIndex = 0;
+
+ for (std::size_t i = 0; i < ranges.size(); ++i) {
+ const SymbolRange &range = ranges[i];
+ const PatternApplicationTrie &patNode = *(key[i]);
+ if (patNode.IsTerminalNode()) {
+ std::size_t offset = range.minStart - spanStart;
+ std::size_t width = range.minEnd - range.minStart + 1;
+ assert(lattice[offset][0][width].empty());
+ lattice[offset][0][width].push_back(patNode.m_pvertex);
+ continue;
+ }
+ const std::vector<Word> &labelVec = labelTable[nonTermIndex];
+ assert(checkTable[nonTermIndex].size() == labelVec.size());
+ for (int s = range.minStart; s <= range.maxStart; ++s) {
+ for (int e = std::max(s, range.minEnd); e <= range.maxEnd; ++e) {
+ assert(e-s >= 0);
+ std::size_t offset = s - spanStart;
+ std::size_t width = e - s + 1;
+ assert(lattice[offset][nonTermIndex+1][width].empty());
+ std::vector<bool>::iterator q = checkTable[nonTermIndex].begin();
+ for (std::vector<Word>::const_iterator p = labelVec.begin();
+ p != labelVec.end(); ++p, ++q) {
+ const Word &label = *p;
+ const PVertex *v =
+ m_chart.GetCell(s, e).nonTerminalVertices.Find(label);
+ lattice[offset][nonTermIndex+1][width].push_back(v);
+ *q = (*q || static_cast<bool>(v));
+ }
+ }
+ }
+ ++nonTermIndex;
+ }
+}
+
+// Extend the lattice if necessary and clear the innermost vectors.
+void TailLatticeBuilder::ExtendAndClear(
+ const std::vector<const PatternApplicationTrie *> &key,
+ const std::vector<SymbolRange> &ranges,
+ TailLattice &lattice,
+ std::vector<std::vector<bool> > &checkTable)
+{
+ const int spanStart = ranges.front().minStart;
+ const int spanEnd = ranges.back().maxEnd;
+
+ const std::size_t span = spanEnd - spanStart + 1;
+
+ // Extend the outermost vector.
+ if (lattice.size() < span) {
+ lattice.resize(span);
+ }
+
+ const RuleTrieScope3::Node *utrieNode = key.back()->m_node;
+ const RuleTrieScope3::Node::LabelTable &labelTable =
+ utrieNode->GetLabelTable();
+
+ std::size_t nonTermIndex = 0;
+
+ for (std::size_t i = 0; i < ranges.size(); ++i) {
+ const SymbolRange &range = ranges[i];
+ const PatternApplicationTrie &patNode = *(key[i]);
+ if (patNode.IsTerminalNode()) {
+ std::size_t offset = range.minStart - spanStart;
+ std::size_t width = range.minEnd - range.minStart + 1;
+ if (lattice[offset].size() < 1) {
+ lattice[offset].resize(1);
+ }
+ if (lattice[offset][0].size() < width+1) {
+ lattice[offset][0].resize(width+1);
+ }
+ lattice[offset][0][width].clear();
+ continue;
+ }
+ const std::vector<Word> &labelVec = labelTable[nonTermIndex];
+ for (int s = range.minStart; s <= range.maxStart; ++s) {
+ for (int e = std::max(s, range.minEnd); e <= range.maxEnd; ++e) {
+ assert(e-s >= 0);
+ std::size_t offset = s - spanStart;
+ std::size_t width = e - s + 1;
+ if (lattice[offset].size() < nonTermIndex+2) {
+ lattice[offset].resize(nonTermIndex+2);
+ }
+ if (lattice[offset][nonTermIndex+1].size() < width+1) {
+ lattice[offset][nonTermIndex+1].resize(width+1);
+ }
+ lattice[offset][nonTermIndex+1][width].clear();
+ lattice[offset][nonTermIndex+1][width].reserve(labelVec.size());
+ }
+ }
+ if (checkTable.size() < nonTermIndex+1) {
+ checkTable.resize(nonTermIndex+1);
+ }
+ // Unlike the lattice itself, the check table must contain initial
+ // values prior to the main build procedure (and the values must be false).
+ checkTable[nonTermIndex].assign(labelVec.size(), false);
+ ++nonTermIndex;
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h
new file mode 100644
index 000000000..c61df8a40
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeBuilder.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/Syntax/S2T/PChart.h"
+
+#include "PatternApplicationTrie.h"
+#include "SymbolRange.h"
+#include "TailLattice.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class TailLatticeBuilder
+{
+ public:
+ TailLatticeBuilder(PChart &chart) : m_chart(chart) {}
+
+ // Given a key from a PatternApplicationTrie and the valid ranges of its
+ // symbols, construct a TailLattice.
+ void Build(const std::vector<const PatternApplicationTrie *> &,
+ const std::vector<SymbolRange> &,
+ TailLattice &, std::vector<std::vector<bool> > &);
+
+ private:
+ // Auxiliary function used by Build. Enlarges a TailLattice, if necessary,
+ // and clears the innermost vectors.
+ void ExtendAndClear(const std::vector<const PatternApplicationTrie *> &,
+ const std::vector<SymbolRange> &,
+ TailLattice &, std::vector<std::vector<bool> > &);
+
+ PChart &m_chart;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h
new file mode 100644
index 000000000..a2897ce73
--- /dev/null
+++ b/moses/Syntax/S2T/Parsers/Scope3Parser/TailLatticeSearcher.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "moses/Syntax/PHyperedge.h"
+
+#include "TailLattice.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+template<typename Callback>
+class TailLatticeSearcher
+{
+ public:
+ TailLatticeSearcher(const TailLattice &lattice,
+ const PatternApplicationKey &key,
+ const std::vector<SymbolRange> &ranges)
+ : m_lattice(lattice)
+ , m_key(key)
+ , m_ranges(ranges) {}
+
+ void Search(const std::vector<int> &labels, const TargetPhraseCollection &tpc,
+ Callback &callback) {
+ m_labels = &labels;
+ m_matchCB = &callback;
+ m_hyperedge.head = 0;
+ m_hyperedge.tail.clear();
+ m_hyperedge.translations = &tpc;
+ SearchInner(0, 0, 0);
+ }
+
+ private:
+ void SearchInner(int offset, std::size_t i, std::size_t nonTermIndex) {
+ assert(m_hyperedge.tail.size() == i);
+
+ const PatternApplicationTrie *patNode = m_key[i];
+ const SymbolRange &range = m_ranges[i];
+
+ if (patNode->IsTerminalNode()) {
+ const int width = range.minEnd - range.minStart + 1;
+ const PVertex *v = m_lattice[offset][0][width][0];
+ // FIXME Sort out const-ness
+ m_hyperedge.tail.push_back(const_cast<PVertex*>(v));
+ if (i == m_key.size()-1) {
+ (*m_matchCB)(m_hyperedge);
+ } else {
+ SearchInner(offset+width, i+1, nonTermIndex);
+ }
+ m_hyperedge.tail.pop_back();
+ return;
+ }
+
+ const int absStart = m_ranges[0].minStart + offset;
+ const int minWidth = std::max(1, range.minEnd - absStart + 1);
+ const int maxWidth = range.maxEnd - absStart + 1;
+
+ const std::vector<std::vector<const PVertex *> > &innerVec =
+ m_lattice[offset][nonTermIndex+1];
+
+ std::size_t labelIndex = (*m_labels)[nonTermIndex];
+
+ // Loop over all possible widths for this offset and index.
+ for (std::size_t width = minWidth; width <= maxWidth; ++width) {
+ const PVertex *v = innerVec[width][labelIndex];
+ if (!v) {
+ continue;
+ }
+ // FIXME Sort out const-ness
+ m_hyperedge.tail.push_back(const_cast<PVertex*>(v));
+ if (i == m_key.size()-1) {
+ (*m_matchCB)(m_hyperedge);
+ } else {
+ SearchInner(offset+width, i+1, nonTermIndex+1);
+ }
+ m_hyperedge.tail.pop_back();
+ }
+ }
+
+ const TailLattice &m_lattice;
+ const PatternApplicationKey &m_key;
+ const std::vector<SymbolRange> &m_ranges;
+ const std::vector<int> *m_labels;
+ Callback *m_matchCB;
+ PHyperedge m_hyperedge;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrie.h b/moses/Syntax/S2T/RuleTrie.h
new file mode 100644
index 000000000..8f6dcbb80
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrie.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cstddef>
+
+#include "moses/Syntax/RuleTable.h"
+
+namespace Moses
+{
+
+class Phrase;
+class TargetPhrase;
+class TargetPhraseCollection;
+class Word;
+
+namespace Syntax
+{
+namespace S2T
+{
+
+// Base class for parser-specific trie types.
+class RuleTrie : public RuleTable
+{
+ public:
+ RuleTrie(const RuleTableFF *ff) : RuleTable(ff) {}
+
+ virtual bool HasPreterminalRule(const Word &) const = 0;
+
+ private:
+ friend class RuleTrieCreator;
+
+ virtual TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS) = 0;
+
+ virtual void SortAndPrune(std::size_t) = 0;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieCYKPlus.cpp b/moses/Syntax/S2T/RuleTrieCYKPlus.cpp
new file mode 100644
index 000000000..9a300e9eb
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieCYKPlus.cpp
@@ -0,0 +1,151 @@
+#include "RuleTrieCYKPlus.h"
+
+#include <map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+
+#include "moses/NonTerminal.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+void RuleTrieCYKPlus::Node::Prune(std::size_t tableLimit)
+{
+ // recursively prune
+ for (SymbolMap::iterator p = m_sourceTermMap.begin();
+ p != m_sourceTermMap.end(); ++p) {
+ p->second.Prune(tableLimit);
+ }
+ for (SymbolMap::iterator p = m_nonTermMap.begin();
+ p != m_nonTermMap.end(); ++p) {
+ p->second.Prune(tableLimit);
+ }
+
+ // prune TargetPhraseCollection in this node
+ m_targetPhraseCollection.Prune(true, tableLimit);
+}
+
+void RuleTrieCYKPlus::Node::Sort(std::size_t tableLimit)
+{
+ // recursively sort
+ for (SymbolMap::iterator p = m_sourceTermMap.begin();
+ p != m_sourceTermMap.end(); ++p) {
+ p->second.Sort(tableLimit);
+ }
+ for (SymbolMap::iterator p = m_nonTermMap.begin();
+ p != m_nonTermMap.end(); ++p) {
+ p->second.Sort(tableLimit);
+ }
+
+ // sort TargetPhraseCollection in this node
+ m_targetPhraseCollection.Sort(true, tableLimit);
+}
+
+RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetOrCreateChild(
+ const Word &sourceTerm)
+{
+ return &m_sourceTermMap[sourceTerm];
+}
+
+RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetOrCreateNonTerminalChild(const Word &targetNonTerm)
+{
+ UTIL_THROW_IF2(!targetNonTerm.IsNonTerminal(),
+ "Not a non-terminal: " << targetNonTerm);
+
+ return &m_nonTermMap[targetNonTerm];
+}
+
+const RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetChild(
+ const Word &sourceTerm) const
+{
+ UTIL_THROW_IF2(sourceTerm.IsNonTerminal(),
+ "Not a terminal: " << sourceTerm);
+
+ SymbolMap::const_iterator p = m_sourceTermMap.find(sourceTerm);
+ return (p == m_sourceTermMap.end()) ? NULL : &p->second;
+}
+
+const RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetNonTerminalChild(
+ const Word &targetNonTerm) const
+{
+ UTIL_THROW_IF2(!targetNonTerm.IsNonTerminal(),
+ "Not a non-terminal: " << targetNonTerm);
+
+ SymbolMap::const_iterator p = m_nonTermMap.find(targetNonTerm);
+ return (p == m_nonTermMap.end()) ? NULL : &p->second;
+}
+
+TargetPhraseCollection &RuleTrieCYKPlus::GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
+{
+ Node &currNode = GetOrCreateNode(source, target, sourceLHS);
+ return currNode.GetTargetPhraseCollection();
+}
+
+RuleTrieCYKPlus::Node &RuleTrieCYKPlus::GetOrCreateNode(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
+{
+ const std::size_t size = source.GetSize();
+
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+ AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
+
+ Node *currNode = &m_root;
+ for (std::size_t pos = 0 ; pos < size ; ++pos) {
+ const Word& word = source.GetWord(pos);
+
+ if (word.IsNonTerminal()) {
+ // indexed by source label 1st
+ const Word &sourceNonTerm = word;
+
+ UTIL_THROW_IF2(iterAlign == alignmentInfo.end(),
+ "No alignment for non-term at position " << pos);
+ UTIL_THROW_IF2(iterAlign->first != pos,
+ "Alignment info incorrect at position " << pos);
+
+ std::size_t targetNonTermInd = iterAlign->second;
+ ++iterAlign;
+ const Word &targetNonTerm = target.GetWord(targetNonTermInd);
+ currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
+ } else {
+ currNode = currNode->GetOrCreateChild(word);
+ }
+
+ UTIL_THROW_IF2(currNode == NULL,
+ "Node not found at position " << pos);
+ }
+
+ // finally, the source LHS
+ //currNode = currNode->GetOrCreateChild(sourceLHS);
+
+ return *currNode;
+}
+
+void RuleTrieCYKPlus::SortAndPrune(std::size_t tableLimit)
+{
+ if (tableLimit) {
+ m_root.Sort(tableLimit);
+ }
+}
+
+bool RuleTrieCYKPlus::HasPreterminalRule(const Word &w) const
+{
+ const Node::SymbolMap &map = m_root.GetTerminalMap();
+ Node::SymbolMap::const_iterator p = map.find(w);
+ return p != map.end() && p->second.HasRules();
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieCYKPlus.h b/moses/Syntax/S2T/RuleTrieCYKPlus.h
new file mode 100644
index 000000000..83ea55b87
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieCYKPlus.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Terminal.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+
+#include "RuleTrie.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class RuleTrieCYKPlus : public RuleTrie
+{
+ public:
+ class Node
+ {
+ public:
+ typedef boost::unordered_map<Word, Node, SymbolHasher,
+ SymbolEqualityPred> SymbolMap;
+
+ bool IsLeaf() const {
+ return m_sourceTermMap.empty() && m_nonTermMap.empty();
+ }
+
+ bool HasRules() const { return !m_targetPhraseCollection.IsEmpty(); }
+
+ void Prune(std::size_t tableLimit);
+ void Sort(std::size_t tableLimit);
+
+ Node *GetOrCreateChild(const Word &sourceTerm);
+ Node *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
+
+ const Node *GetChild(const Word &sourceTerm) const;
+ const Node *GetNonTerminalChild(const Word &targetNonTerm) const;
+
+ const TargetPhraseCollection &GetTargetPhraseCollection() const {
+ return m_targetPhraseCollection;
+ }
+
+ TargetPhraseCollection &GetTargetPhraseCollection() {
+ return m_targetPhraseCollection;
+ }
+
+ const SymbolMap &GetTerminalMap() const { return m_sourceTermMap; }
+
+ const SymbolMap &GetNonTerminalMap() const { return m_nonTermMap; }
+
+ private:
+ SymbolMap m_sourceTermMap;
+ SymbolMap m_nonTermMap;
+ TargetPhraseCollection m_targetPhraseCollection;
+ };
+
+ RuleTrieCYKPlus(const RuleTableFF *ff) : RuleTrie(ff) {}
+
+ const Node &GetRootNode() const { return m_root; }
+
+ bool HasPreterminalRule(const Word &) const;
+
+ private:
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
+
+ Node &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS);
+
+ void SortAndPrune(std::size_t);
+
+ Node m_root;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieCreator.h b/moses/Syntax/S2T/RuleTrieCreator.h
new file mode 100644
index 000000000..1fe99e609
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieCreator.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "RuleTrie.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+// Base for classes that create a RuleTrie (currently RuleTrieLoader and
+// OovHandler). RuleTrieCreator is a friend of RuleTrie.
+class RuleTrieCreator
+{
+ protected:
+ // Provide access to RuleTrie's private SortAndPrune function.
+ void SortAndPrune(RuleTrie &trie, std::size_t limit) {
+ trie.SortAndPrune(limit);
+ }
+
+ // Provide access to RuleTrie's private GetOrCreateTargetPhraseCollection
+ // function.
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ RuleTrie &trie, const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS) {
+ return trie.GetOrCreateTargetPhraseCollection(source, target, sourceLHS);
+ }
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieLoader.cpp b/moses/Syntax/S2T/RuleTrieLoader.cpp
new file mode 100644
index 000000000..8efa4969b
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieLoader.cpp
@@ -0,0 +1,156 @@
+#include "RuleTrieLoader.h"
+
+#include <sys/stat.h>
+#include <stdlib.h>
+
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <iostream>
+#include <math.h>
+
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+#include "moses/Util.h"
+#include "moses/InputFileStream.h"
+#include "moses/StaticData.h"
+#include "moses/WordsRange.h"
+#include "moses/UserMessage.h"
+#include "moses/ChartTranslationOptionList.h"
+#include "moses/FactorCollection.h"
+#include "moses/Syntax/RuleTableFF.h"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/double-conversion/double-conversion.h"
+#include "util/exception.hh"
+
+#include "RuleTrie.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
+ const std::vector<FactorType> &output,
+ const std::string &inFile,
+ const RuleTableFF &ff,
+ RuleTrie &trie)
+{
+ PrintUserTime(std::string("Start loading text phrase table. Moses format"));
+
+ const StaticData &staticData = StaticData::Instance();
+ const std::string &factorDelimiter = staticData.GetFactorDelimiter();
+
+ std::size_t count = 0;
+
+ std::ostream *progress = NULL;
+ IFVERBOSE(1) progress = &std::cerr;
+ util::FilePiece in(inFile.c_str(), progress);
+
+ // reused variables
+ std::vector<float> scoreVector;
+ StringPiece line;
+
+ double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+
+ while(true) {
+ try {
+ line = in.ReadLine();
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
+
+ util::TokenIter<util::MultiCharacter> pipes(line, "|||");
+ StringPiece sourcePhraseString(*pipes);
+ StringPiece targetPhraseString(*++pipes);
+ StringPiece scoreString(*++pipes);
+
+ StringPiece alignString;
+ if (++pipes) {
+ StringPiece temp(*pipes);
+ alignString = temp;
+ }
+
+ if (++pipes) {
+ StringPiece str(*pipes); //counts
+ }
+
+ bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
+ if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
+ TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
+ continue;
+ }
+
+ scoreVector.clear();
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+ int processed;
+ float score = converter.StringToFloat(s->data(), s->length(), &processed);
+ UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
+ scoreVector.push_back(FloorScore(TransformScore(score)));
+ }
+ const size_t numScoreComponents = ff.GetNumScoreComponents();
+ if (scoreVector.size() != numScoreComponents) {
+ UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
+ << numScoreComponents << ") of score components on line " << count);
+ }
+
+ // parse source & find pt node
+
+ // constituent labels
+ Word *sourceLHS = NULL;
+ Word *targetLHS;
+
+ // create target phrase obj
+ TargetPhrase *targetPhrase = new TargetPhrase(&ff);
+ // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
+ targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
+ // source
+ Phrase sourcePhrase;
+ // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
+ sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
+
+ // rest of target phrase
+ targetPhrase->SetAlignmentInfo(alignString);
+ targetPhrase->SetTargetLHS(targetLHS);
+
+ //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
+
+ if (++pipes) {
+ StringPiece sparseString(*pipes);
+ targetPhrase->SetSparseScore(&ff, sparseString);
+ }
+
+ if (++pipes) {
+ StringPiece propertiesString(*pipes);
+ targetPhrase->SetProperties(propertiesString);
+ }
+
+ targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
+ targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());
+
+ TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
+ trie, sourcePhrase, *targetPhrase, sourceLHS);
+ phraseColl.Add(targetPhrase);
+
+ // not implemented correctly in memory pt. just delete it for now
+ delete sourceLHS;
+
+ count++;
+ }
+
+ // sort and prune each target phrase collection
+ if (ff.GetTableLimit()) {
+ SortAndPrune(trie, ff.GetTableLimit());
+ }
+
+ return true;
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieLoader.h b/moses/Syntax/S2T/RuleTrieLoader.h
new file mode 100644
index 000000000..c625f91d6
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieLoader.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <istream>
+#include <vector>
+
+#include "moses/TypeDef.h"
+#include "moses/Syntax/RuleTableFF.h"
+
+#include "RuleTrie.h"
+#include "RuleTrieCreator.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class RuleTrieLoader : public RuleTrieCreator
+{
+ public:
+ bool Load(const std::vector<FactorType> &input,
+ const std::vector<FactorType> &output,
+ const std::string &inFile,
+ const RuleTableFF &,
+ RuleTrie &);
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieScope3.cpp b/moses/Syntax/S2T/RuleTrieScope3.cpp
new file mode 100644
index 000000000..a16cbefdc
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieScope3.cpp
@@ -0,0 +1,153 @@
+#include "RuleTrieScope3.h"
+
+#include <map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+
+#include "moses/NonTerminal.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+void RuleTrieScope3::Node::Prune(std::size_t tableLimit)
+{
+ // Recursively prune child node values.
+ for (TerminalMap::iterator p = m_terminalMap.begin();
+ p != m_terminalMap.end(); ++p) {
+ p->second.Prune(tableLimit);
+ }
+ if (m_gapNode) {
+ m_gapNode->Prune(tableLimit);
+ }
+
+ // Prune TargetPhraseCollections at this node.
+ for (LabelMap::iterator p = m_labelMap.begin(); p != m_labelMap.end(); ++p) {
+ p->second.Prune(true, tableLimit);
+ }
+}
+
+void RuleTrieScope3::Node::Sort(std::size_t tableLimit)
+{
+ // Recursively sort child node values.
+ for (TerminalMap::iterator p = m_terminalMap.begin();
+ p != m_terminalMap.end(); ++p) {
+ p->second.Sort(tableLimit);
+ }
+ if (m_gapNode) {
+ m_gapNode->Sort(tableLimit);
+ }
+
+ // Sort TargetPhraseCollections at this node.
+ for (LabelMap::iterator p = m_labelMap.begin(); p != m_labelMap.end(); ++p) {
+ p->second.Sort(true, tableLimit);
+ }
+}
+
+RuleTrieScope3::Node *RuleTrieScope3::Node::GetOrCreateTerminalChild(
+ const Word &sourceTerm)
+{
+ assert(!sourceTerm.IsNonTerminal());
+ std::pair<TerminalMap::iterator, bool> result;
+ result = m_terminalMap.insert(std::make_pair(sourceTerm, Node()));
+ const TerminalMap::iterator &iter = result.first;
+ Node &child = iter->second;
+ return &child;
+}
+
+RuleTrieScope3::Node *RuleTrieScope3::Node::GetOrCreateNonTerminalChild(
+ const Word &targetNonTerm)
+{
+ assert(targetNonTerm.IsNonTerminal());
+ if (m_gapNode == NULL) {
+ m_gapNode = new Node();
+ }
+ return m_gapNode;
+}
+
+TargetPhraseCollection &
+RuleTrieScope3::Node::GetOrCreateTargetPhraseCollection(
+ const TargetPhrase &target)
+{
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+ const std::size_t rank = alignmentInfo.GetSize();
+
+ std::vector<int> vec;
+ vec.reserve(rank);
+
+ m_labelTable.resize(rank);
+
+ int i = 0;
+ for (AlignmentInfo::const_iterator p = alignmentInfo.begin();
+ p != alignmentInfo.end(); ++p) {
+ std::size_t targetNonTermIndex = p->second;
+ const Word &targetNonTerm = target.GetWord(targetNonTermIndex);
+ vec.push_back(InsertLabel(i++, targetNonTerm));
+ }
+
+ return m_labelMap[vec];
+}
+
+TargetPhraseCollection &RuleTrieScope3::GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
+{
+ Node &currNode = GetOrCreateNode(source, target, sourceLHS);
+ return currNode.GetOrCreateTargetPhraseCollection(target);
+}
+
+RuleTrieScope3::Node &RuleTrieScope3::GetOrCreateNode(
+ const Phrase &source, const TargetPhrase &target, const Word */*sourceLHS*/)
+{
+ const std::size_t size = source.GetSize();
+
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+ AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
+
+ Node *currNode = &m_root;
+ for (std::size_t pos = 0 ; pos < size ; ++pos) {
+ const Word &word = source.GetWord(pos);
+
+ if (word.IsNonTerminal()) {
+ assert(iterAlign != alignmentInfo.end());
+ assert(iterAlign->first == pos);
+ std::size_t targetNonTermInd = iterAlign->second;
+ ++iterAlign;
+ const Word &targetNonTerm = target.GetWord(targetNonTermInd);
+ currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
+ } else {
+ currNode = currNode->GetOrCreateTerminalChild(word);
+ }
+
+ assert(currNode != NULL);
+ }
+
+ return *currNode;
+}
+
+void RuleTrieScope3::SortAndPrune(std::size_t tableLimit)
+{
+ if (tableLimit) {
+ m_root.Sort(tableLimit);
+ }
+}
+
+bool RuleTrieScope3::HasPreterminalRule(const Word &w) const
+{
+ const Node::TerminalMap &map = m_root.GetTerminalMap();
+ Node::TerminalMap::const_iterator p = map.find(w);
+ return p != map.end() && p->second.HasRules();
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/RuleTrieScope3.h b/moses/Syntax/S2T/RuleTrieScope3.h
new file mode 100644
index 000000000..6dd38a4f1
--- /dev/null
+++ b/moses/Syntax/S2T/RuleTrieScope3.h
@@ -0,0 +1,106 @@
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+
+#include "RuleTrie.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class RuleTrieScope3 : public RuleTrie
+{
+ public:
+ class Node
+ {
+ public:
+ typedef std::vector<std::vector<Word> > LabelTable;
+
+ typedef boost::unordered_map<Word, Node, SymbolHasher,
+ SymbolEqualityPred> TerminalMap;
+
+ typedef boost::unordered_map<std::vector<int>,
+ TargetPhraseCollection> LabelMap;
+
+ ~Node() { delete m_gapNode; }
+
+ const LabelTable &GetLabelTable() const { return m_labelTable; }
+
+ const LabelMap &GetLabelMap() const { return m_labelMap; }
+
+ const TerminalMap &GetTerminalMap() const { return m_terminalMap; }
+
+ const Node *GetNonTerminalChild() const { return m_gapNode; }
+
+ Node *GetOrCreateTerminalChild(const Word &sourceTerm);
+
+ Node *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
+
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ const TargetPhrase &);
+
+ bool IsLeaf() const { return m_terminalMap.empty() && m_gapNode == NULL; }
+
+ bool HasRules() const { return !m_labelMap.empty(); }
+
+ void Prune(std::size_t tableLimit);
+ void Sort(std::size_t tableLimit);
+
+ private:
+ friend class RuleTrieScope3;
+
+ Node() : m_gapNode(NULL) {}
+
+ int InsertLabel(int i, const Word &w) {
+ std::vector<Word> &inner = m_labelTable[i];
+ for (std::size_t j = 0; j < inner.size(); ++j) {
+ if (inner[j] == w) {
+ return j;
+ }
+ }
+ inner.push_back(w);
+ return inner.size()-1;
+ }
+
+ LabelTable m_labelTable;
+ LabelMap m_labelMap;
+ TerminalMap m_terminalMap;
+ Node *m_gapNode;
+ };
+
+ RuleTrieScope3(const RuleTableFF *ff) : RuleTrie(ff) {}
+
+ const Node &GetRootNode() const { return m_root; }
+
+ bool HasPreterminalRule(const Word &) const;
+
+ private:
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
+
+ Node &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS);
+
+ void SortAndPrune(std::size_t);
+
+ Node m_root;
+};
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/SChart.cpp b/moses/Syntax/S2T/SChart.cpp
new file mode 100644
index 000000000..f47d6efdb
--- /dev/null
+++ b/moses/Syntax/S2T/SChart.cpp
@@ -0,0 +1,20 @@
+#include "SChart.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+SChart::SChart(std::size_t width)
+{
+ m_cells.resize(width);
+ for (std::size_t i = 0; i < width; ++i) {
+ m_cells[i].resize(width);
+ }
+}
+
+} // namespace S2T
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/S2T/SChart.h b/moses/Syntax/S2T/SChart.h
new file mode 100644
index 000000000..62b7d0c2b
--- /dev/null
+++ b/moses/Syntax/S2T/SChart.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "moses/Syntax/NonTerminalMap.h"
+#include "moses/Syntax/SVertexStack.h"
+#include "moses/Syntax/SymbolEqualityPred.h"
+#include "moses/Syntax/SymbolHasher.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+namespace S2T
+{
+
+class SChart
+{
+ public:
+ struct Cell
+ {
+ typedef boost::unordered_map<Word, SVertexStack, SymbolHasher,
+ SymbolEqualityPred> TMap;
+ typedef NonTerminalMap<SVertexStack> NMap;
+ TMap terminalStacks;
+ NMap nonTerminalStacks;
+ };
+
+ SChart(std::size_t width);
+
+ std::size_t GetWidth() const { return m_cells.size(); }
+
+ const Cell &GetCell(std::size_t start, std::size_t end) const {
+ return m_cells[start][end];
+ }
+
+ Cell &GetCell(std::size_t start, std::size_t end) {
+ return m_cells[start][end];
+ }
+
+ private:
+ std::vector<std::vector<Cell> > m_cells;
+};
+
+} // S2T
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SHyperedge.cpp b/moses/Syntax/SHyperedge.cpp
new file mode 100644
index 000000000..0f098c7a4
--- /dev/null
+++ b/moses/Syntax/SHyperedge.cpp
@@ -0,0 +1,59 @@
+#include "SHyperedge.h"
+
+#include "moses/StaticData.h"
+
+#include "SVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+Phrase GetOneBestTargetYield(const SHyperedge &h)
+{
+ FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
+
+ Phrase ret(ARRAY_SIZE_INCR);
+
+ const AlignmentInfo::NonTermIndexMap &targetToSourceMap =
+ h.translation->GetAlignNonTerm().GetNonTermIndexMap2();
+
+ for (std::size_t pos = 0; pos < h.translation->GetSize(); ++pos) {
+ const Word &word = h.translation->GetWord(pos);
+ if (word.IsNonTerminal()) {
+ std::size_t sourceIndex = targetToSourceMap[pos];
+ const SHyperedge &incoming = *h.tail[sourceIndex]->best;
+ Phrase subPhrase = GetOneBestTargetYield(incoming);
+ ret.Append(subPhrase);
+ } else {
+ ret.AddWord(word);
+ if (placeholderFactor == NOT_FOUND) {
+ continue;
+ }
+ assert(false);
+ // FIXME Modify this chunk of code to work for SHyperedge.
+/*
+ std::set<std::size_t> sourcePosSet =
+ h.translation->GetAlignTerm().GetAlignmentsForTarget(pos);
+ if (sourcePosSet.size() == 1) {
+ const std::vector<const Word*> *ruleSourceFromInputPath =
+ hypo.GetTranslationOption().GetSourceRuleFromInputPath();
+ UTIL_THROW_IF2(ruleSourceFromInputPath == NULL,
+ "Source Words in of the rules hasn't been filled out");
+ std::size_t sourcePos = *sourcePosSet.begin();
+ const Word *sourceWord = ruleSourceFromInputPath->at(sourcePos);
+ UTIL_THROW_IF2(sourceWord == NULL,
+ "Null source word at position " << sourcePos);
+ const Factor *factor = sourceWord->GetFactor(placeholderFactor);
+ if (factor) {
+ ret.Back()[0] = factor;
+ }
+ }
+*/
+ }
+ }
+ return ret;
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SHyperedge.h b/moses/Syntax/SHyperedge.h
new file mode 100644
index 000000000..6d9128d49
--- /dev/null
+++ b/moses/Syntax/SHyperedge.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/Phrase.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TargetPhrase.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct SVertex;
+
+struct SHyperedge
+{
+ SVertex *head;
+ std::vector<SVertex*> tail;
+ float score;
+ ScoreComponentCollection scoreBreakdown;
+ const TargetPhrase *translation;
+};
+
+Phrase GetOneBestTargetYield(const SHyperedge &h);
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SHyperedgeBundle.h b/moses/Syntax/SHyperedgeBundle.h
new file mode 100644
index 000000000..4a78c5458
--- /dev/null
+++ b/moses/Syntax/SHyperedgeBundle.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <vector>
+
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TargetPhraseCollection.h"
+
+#include "SVertexStack.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct PVertex;
+
+struct SHyperedgeBundle
+{
+ std::vector<const SVertexStack*> stacks;
+ const TargetPhraseCollection *translations;
+
+ friend void swap(SHyperedgeBundle &x, SHyperedgeBundle &y) {
+ using std::swap;
+ swap(x.stacks, y.stacks);
+ swap(x.translations, y.translations);
+ }
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SHyperedgeBundleScorer.h b/moses/Syntax/SHyperedgeBundleScorer.h
new file mode 100644
index 000000000..3bf547cfd
--- /dev/null
+++ b/moses/Syntax/SHyperedgeBundleScorer.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "SHyperedgeBundle.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct SHyperedgeBundleScorer
+{
+ public:
+ static float Score(const SHyperedgeBundle &bundle) {
+ const TargetPhrase &targetPhrase = **(bundle.translations->begin());
+ float score = targetPhrase.GetFutureScore();
+ for (std::vector<const SVertexStack*>::const_iterator p =
+ bundle.stacks.begin(); p != bundle.stacks.end(); ++p) {
+ const SVertexStack *stack = *p;
+ if (stack->front()->best) {
+ score += stack->front()->best->score;
+ }
+ }
+ return score;
+ }
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SVertex.cpp b/moses/Syntax/SVertex.cpp
new file mode 100644
index 000000000..32650b2a8
--- /dev/null
+++ b/moses/Syntax/SVertex.cpp
@@ -0,0 +1,28 @@
+#include "SVertex.h"
+
+#include "moses/FF/FFState.h"
+
+#include "SHyperedge.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+SVertex::~SVertex()
+{
+ // Delete incoming SHyperedge objects.
+ delete best;
+ for (std::vector<SHyperedge*>::iterator p = recombined.begin();
+ p != recombined.end(); ++p) {
+ delete *p;
+ }
+ // Delete FFState objects.
+ for (std::vector<FFState*>::iterator p = state.begin();
+ p != state.end(); ++p) {
+ delete *p;
+ }
+}
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SVertex.h b/moses/Syntax/SVertex.h
new file mode 100644
index 000000000..cde14c21a
--- /dev/null
+++ b/moses/Syntax/SVertex.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <vector>
+
+namespace Moses
+{
+
+class FFState;
+
+namespace Syntax
+{
+
+struct PVertex;
+struct SHyperedge;
+
+// A vertex in the search hypergraph.
+//
+// Important: a SVertex owns its incoming SHyperedge objects and its FFState
+// objects and will delete them on destruction.
+struct SVertex
+{
+ ~SVertex();
+
+ SHyperedge *best;
+ std::vector<SHyperedge*> recombined;
+ const PVertex *pvertex;
+ std::vector<FFState*> state;
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SVertexRecombinationOrderer.h b/moses/Syntax/SVertexRecombinationOrderer.h
new file mode 100644
index 000000000..60686d989
--- /dev/null
+++ b/moses/Syntax/SVertexRecombinationOrderer.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "moses/FF/FFState.h"
+
+#include "SVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+struct SVertexRecombinationOrderer
+{
+ public:
+ bool operator()(const SVertex &x, const SVertex &y) const
+ {
+ int comp = 0;
+ for (std::size_t i = 0; i < x.state.size(); ++i) {
+ if (x.state[i] == NULL || y.state[i] == NULL) {
+ comp = x.state[i] - y.state[i];
+ } else {
+ comp = x.state[i]->Compare(*y.state[i]);
+ }
+ if (comp != 0) {
+ return comp < 0;
+ }
+ }
+ return false;
+ }
+
+ bool operator()(const SVertex *x, const SVertex *y) const
+ {
+ return operator()(*x, *y);
+ }
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SVertexStack.h b/moses/Syntax/SVertexStack.h
new file mode 100644
index 000000000..57dc9f247
--- /dev/null
+++ b/moses/Syntax/SVertexStack.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+
+#include "SHyperedge.h"
+#include "SVertex.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+typedef std::vector<boost::shared_ptr<SVertex> > SVertexStack;
+
+struct SVertexStackContentOrderer
+{
+ public:
+ bool operator()(const boost::shared_ptr<SVertex> &x,
+ const boost::shared_ptr<SVertex> &y)
+ {
+ return x->best->score > y->best->score;
+ }
+};
+
+} // Syntax
+} // Moses
diff --git a/moses/Syntax/SymbolEqualityPred.h b/moses/Syntax/SymbolEqualityPred.h
new file mode 100644
index 000000000..e97c4f11b
--- /dev/null
+++ b/moses/Syntax/SymbolEqualityPred.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "moses/Factor.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// Assumes that only the first factor is relevant. i.e. factored decoding will
+// *not* work in moses_chart unless this is changed (among other things).
+class SymbolEqualityPred
+{
+ public:
+ bool operator()(const Word &s1, const Word &s2) const {
+ const Factor *f1 = s1[0];
+ const Factor *f2 = s2[0];
+ return !(f1->Compare(*f2));
+ }
+};
+
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/Syntax/SymbolHasher.h b/moses/Syntax/SymbolHasher.h
new file mode 100644
index 000000000..b398fdd00
--- /dev/null
+++ b/moses/Syntax/SymbolHasher.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <boost/functional/hash.hpp>
+
+#include "moses/Factor.h"
+#include "moses/Word.h"
+
+namespace Moses
+{
+namespace Syntax
+{
+
+// Assumes that only the first factor is relevant. i.e. factored decoding will
+// *not* work in moses_chart unless this is changed (among other things).
+class SymbolHasher
+{
+ public:
+ std::size_t operator()(const Word &s) const {
+ const Factor *f = s[0];
+ return hash_value(*f);
+ }
+};
+
+} // namespace Syntax
+} // namespace Moses
diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp
index 5afb53ecc..6bdae40ed 100644
--- a/moses/TargetPhrase.cpp
+++ b/moses/TargetPhrase.cpp
@@ -38,7 +38,7 @@ using namespace std;
namespace Moses
{
-TargetPhrase::TargetPhrase( std::string out_string)
+TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt)
:Phrase(0)
, m_fullScore(0.0)
, m_futureScore(0.0)
@@ -46,6 +46,7 @@ TargetPhrase::TargetPhrase( std::string out_string)
, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
, m_lhsTarget(NULL)
, m_ruleSource(NULL)
+ , m_container(pt)
{
//ACAT
@@ -55,7 +56,7 @@ TargetPhrase::TargetPhrase( std::string out_string)
NULL);
}
-TargetPhrase::TargetPhrase()
+TargetPhrase::TargetPhrase(const PhraseDictionary *pt)
:Phrase()
, m_fullScore(0.0)
, m_futureScore(0.0)
@@ -63,10 +64,11 @@ TargetPhrase::TargetPhrase()
, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
, m_lhsTarget(NULL)
, m_ruleSource(NULL)
+ , m_container(pt)
{
}
-TargetPhrase::TargetPhrase(const Phrase &phrase)
+TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt)
: Phrase(phrase)
, m_fullScore(0.0)
, m_futureScore(0.0)
@@ -74,6 +76,7 @@ TargetPhrase::TargetPhrase(const Phrase &phrase)
, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
, m_lhsTarget(NULL)
, m_ruleSource(NULL)
+ , m_container(pt)
{
}
@@ -84,6 +87,7 @@ TargetPhrase::TargetPhrase(const TargetPhrase &copy)
, m_scoreBreakdown(copy.m_scoreBreakdown)
, m_alignTerm(copy.m_alignTerm)
, m_alignNonTerm(copy.m_alignNonTerm)
+ , m_container(copy.m_container)
{
if (copy.m_lhsTarget) {
m_lhsTarget = new Word(*copy.m_lhsTarget);
@@ -115,13 +119,13 @@ void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const
}
#endif
-void TargetPhrase::Evaluate(const Phrase &source)
+void TargetPhrase::EvaluateInIsolation(const Phrase &source)
{
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
- Evaluate(source, ffs);
+ EvaluateInIsolation(source, ffs);
}
-void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunction*> &ffs)
+void TargetPhrase::EvaluateInIsolation(const Phrase &source, const std::vector<FeatureFunction*> &ffs)
{
if (ffs.size()) {
const StaticData &staticData = StaticData::Instance();
@@ -129,7 +133,7 @@ void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunct
for (size_t i = 0; i < ffs.size(); ++i) {
const FeatureFunction &ff = *ffs[i];
if (! staticData.IsFeatureFunctionIgnored( ff )) {
- ff.Evaluate(source, *this, m_scoreBreakdown, futureScoreBreakdown);
+ ff.EvaluateInIsolation(source, *this, m_scoreBreakdown, futureScoreBreakdown);
}
}
@@ -139,7 +143,7 @@ void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunct
}
}
-void TargetPhrase::Evaluate(const InputType &input, const InputPath &inputPath)
+void TargetPhrase::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
{
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
const StaticData &staticData = StaticData::Instance();
@@ -147,7 +151,7 @@ void TargetPhrase::Evaluate(const InputType &input, const InputPath &inputPath)
for (size_t i = 0; i < ffs.size(); ++i) {
const FeatureFunction &ff = *ffs[i];
if (! staticData.IsFeatureFunctionIgnored( ff )) {
- ff.Evaluate(input, inputPath, *this, NULL, m_scoreBreakdown, &futureScoreBreakdown);
+ ff.EvaluateWithSourceContext(input, inputPath, *this, NULL, m_scoreBreakdown, &futureScoreBreakdown);
}
}
float weightedScore = m_scoreBreakdown.GetWeightedScore();
@@ -189,18 +193,18 @@ void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
// cerr << "TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) this:|" << *this << "|\n";
}
-void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll)
-{
- const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
- m_alignTerm = alignmentInfo;
+// void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll)
+// {
+// const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
+// m_alignTerm = alignmentInfo;
-}
+// }
-void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll)
-{
- const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
- m_alignNonTerm = alignmentInfo;
-}
+// void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll)
+// {
+// const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
+// m_alignNonTerm = alignmentInfo;
+// }
void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString)
{
@@ -246,15 +250,15 @@ void TargetPhrase::SetProperty(const std::string &key, const std::string &value)
m_properties[key] = phrasePropertyFactory.ProduceProperty(key,value);
}
-bool TargetPhrase::GetProperty(const std::string &key, boost::shared_ptr<PhraseProperty> &value) const
+const PhraseProperty *TargetPhrase::GetProperty(const std::string &key) const
{
std::map<std::string, boost::shared_ptr<PhraseProperty> >::const_iterator iter;
iter = m_properties.find(key);
if (iter != m_properties.end()) {
- value = iter->second;
- return true;
+ const boost::shared_ptr<PhraseProperty> &pp = iter->second;
+ return pp.get();
}
- return false;
+ return NULL;
}
void TargetPhrase::SetRuleSource(const Phrase &ruleSource) const
@@ -284,17 +288,30 @@ std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp)
}
os << static_cast<const Phrase&>(tp) << ":" << flush;
- // os << tp.GetAlignNonTerm() << flush;
+ os << tp.GetAlignNonTerm() << flush;
os << ": term=" << tp.GetAlignTerm() << flush;
os << ": nonterm=" << tp.GetAlignNonTerm() << flush;
os << ": c=" << tp.m_fullScore << flush;
os << " " << tp.m_scoreBreakdown << flush;
-
+
const Phrase *sourcePhrase = tp.GetRuleSource();
if (sourcePhrase) {
os << " sourcePhrase=" << *sourcePhrase << flush;
}
+ if (tp.m_properties.size()) {
+ os << " properties: " << flush;
+
+ TargetPhrase::Properties::const_iterator iter;
+ for (iter = tp.m_properties.begin(); iter != tp.m_properties.end(); ++iter) {
+ const string &key = iter->first;
+ const PhraseProperty *prop = iter->second.get();
+ assert(prop);
+
+ os << key << "=" << *prop << " ";
+ }
+ }
+
return os;
}
diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h
index 10435857f..fbad03678 100644
--- a/moses/TargetPhrase.h
+++ b/moses/TargetPhrase.h
@@ -28,6 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Phrase.h"
#include "ScoreComponentCollection.h"
#include "AlignmentInfo.h"
+#include "AlignmentInfoCollection.h"
#include "moses/PP/PhraseProperty.h"
#include "util/string_piece.hh"
@@ -41,6 +42,8 @@ namespace Moses
{
class FeatureFunction;
class InputPath;
+class InputPath;
+class PhraseDictionary;
/** represents an entry on the target side of a phrase table (scores, translation, alignment)
*/
@@ -57,24 +60,27 @@ private:
const Word *m_lhsTarget;
mutable Phrase *m_ruleSource; // to be set by the feature function that needs it.
- std::map<std::string, boost::shared_ptr<PhraseProperty> > m_properties;
+ typedef std::map<std::string, boost::shared_ptr<PhraseProperty> > Properties;
+ Properties m_properties;
+
+ const PhraseDictionary *m_container;
public:
- TargetPhrase();
+ TargetPhrase(const PhraseDictionary *pt = NULL);
+ TargetPhrase(std::string out_string, const PhraseDictionary *pt = NULL);
TargetPhrase(const TargetPhrase &copy);
- explicit TargetPhrase(std::string out_string);
- explicit TargetPhrase(const Phrase &targetPhrase);
+ explicit TargetPhrase(const Phrase &targetPhrase, const PhraseDictionary *pt);
~TargetPhrase();
// 1st evaluate method. Called during loading of phrase table.
- void Evaluate(const Phrase &source, const std::vector<FeatureFunction*> &ffs);
+ void EvaluateInIsolation(const Phrase &source, const std::vector<FeatureFunction*> &ffs);
// as above, score with ALL FFs
// Used only for OOV processing. Doesn't have a phrase table connect with it
- void Evaluate(const Phrase &source);
+ void EvaluateInIsolation(const Phrase &source);
// 'inputPath' is guaranteed to be the raw substring from the input. No factors were added or taken away
- void Evaluate(const InputType &input, const InputPath &inputPath);
+ void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
void SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString);
@@ -126,8 +132,24 @@ public:
m_alignNonTerm = alignNonTerm;
}
- void SetAlignTerm(const AlignmentInfo::CollType &coll);
- void SetAlignNonTerm(const AlignmentInfo::CollType &coll);
+ // ALNREP = alignment representation,
+ // see AlignmentInfo constructors for supported representations
+ template<typename ALNREP>
+ void
+ SetAlignTerm(const ALNREP &coll)
+ {
+ m_alignTerm = AlignmentInfoCollection::Instance().Add(coll);
+ }
+
+ // ALNREP = alignment representation,
+ // see AlignmentInfo constructors for supported representations
+ template<typename ALNREP>
+ void
+ SetAlignNonTerm(const ALNREP &coll)
+ {
+ m_alignNonTerm = AlignmentInfoCollection::Instance().Add(coll);
+ }
+
const AlignmentInfo &GetAlignTerm() const {
return *m_alignTerm;
@@ -140,13 +162,16 @@ public:
return m_ruleSource;
}
+ const PhraseDictionary *GetContainer() const
+ { return m_container; }
+
// To be set by the FF that needs it, by default the rule source = NULL
// make a copy of the source side of the rule
void SetRuleSource(const Phrase &ruleSource) const;
void SetProperties(const StringPiece &str);
void SetProperty(const std::string &key, const std::string &value);
- bool GetProperty(const std::string &key, boost::shared_ptr<PhraseProperty> &value) const;
+ const PhraseProperty *GetProperty(const std::string &key) const;
void Merge(const TargetPhrase &copy, const std::vector<FactorType>& factorVec);
diff --git a/moses/TargetPhraseCollection.h b/moses/TargetPhraseCollection.h
index aae22d82a..0af89e833 100644
--- a/moses/TargetPhraseCollection.h
+++ b/moses/TargetPhraseCollection.h
@@ -44,6 +44,12 @@ public:
typedef CollType::iterator iterator;
typedef CollType::const_iterator const_iterator;
+ TargetPhrase const*
+ operator[](size_t const i) const
+ {
+ return m_collection.at(i);
+ }
+
iterator begin() {
return m_collection.begin();
}
diff --git a/moses/ThreadPool.cpp b/moses/ThreadPool.cpp
index 9d0cdd06b..265c150c2 100644
--- a/moses/ThreadPool.cpp
+++ b/moses/ThreadPool.cpp
@@ -55,8 +55,11 @@ void ThreadPool::Execute()
}
//Execute job
if (task) {
+ // must read from task before run. otherwise task may be deleted by main thread
+ // race condition
+ bool del = task->DeleteAfterExecution();
task->Run();
- if (task->DeleteAfterExecution()) {
+ if (del) {
delete task;
}
}
diff --git a/moses/Timer.cpp b/moses/Timer.cpp
index 5f8508ab7..6128ab885 100644
--- a/moses/Timer.cpp
+++ b/moses/Timer.cpp
@@ -2,6 +2,7 @@
#include <iomanip>
#include "Util.h"
#include "Timer.h"
+#include "StaticData.h"
#include "util/usage.hh"
@@ -30,7 +31,7 @@ double Timer::get_elapsed_time() const
void Timer::start(const char* msg)
{
// Print an optional message, something like "Starting timer t";
- if (msg) TRACE_ERR( msg << std::endl);
+ if (msg) VERBOSE(1, msg << std::endl);
// Return immediately if the timer is already running
if (running && !stopped) return;
@@ -52,7 +53,7 @@ void Timer::start(const char* msg)
void Timer::stop(const char* msg)
{
// Print an optional message, something like "Stopping timer t";
- if (msg) TRACE_ERR( msg << std::endl);
+ if (msg) VERBOSE(1, msg << std::endl);
// Return immediately if the timer is not running
if (stopped || !running) return;
@@ -70,10 +71,10 @@ void Timer::stop(const char* msg)
void Timer::check(const char* msg)
{
// Print an optional message, something like "Checking timer t";
- if (msg) TRACE_ERR( msg << " : ");
+ if (msg) VERBOSE(1, msg << " : ");
-// TRACE_ERR( "[" << std::setiosflags(std::ios::fixed) << std::setprecision(2) << (running ? elapsed_time() : 0) << "] seconds\n");
- TRACE_ERR( "[" << (running ? get_elapsed_time() : 0) << "] seconds\n");
+// VERBOSE(1, "[" << std::setiosflags(std::ios::fixed) << std::setprecision(2) << (running ? elapsed_time() : 0) << "] seconds\n");
+ VERBOSE(1, "[" << (running ? get_elapsed_time() : 0) << "] seconds\n");
}
/***
diff --git a/moses-cmd/TranslationAnalysis.cpp b/moses/TranslationAnalysis.cpp
index e77486162..ed948f6b6 100644
--- a/moses-cmd/TranslationAnalysis.cpp
+++ b/moses/TranslationAnalysis.cpp
@@ -5,6 +5,7 @@
#include <algorithm>
#include "moses/StaticData.h"
#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
#include "TranslationAnalysis.h"
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/StatelessFeatureFunction.h"
@@ -134,4 +135,33 @@ void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo)
os << std::endl;
}
+void PrintTranslationAnalysis(std::ostream &os, const Moses::ChartHypothesis* hypo)
+{
+ /*
+ os << endl << "TRANSLATION HYPOTHESIS DETAILS:" << endl;
+ queue<const Hypothesis*> translationPath;
+ while (hypo)
+ {
+ translationPath.push(hypo);
+ hypo = hypo->GetPrevHypo();
+ }
+
+ while (!translationPath.empty())
+ {
+ hypo = translationPath.front();
+ translationPath.pop();
+ const TranslationOption *transOpt = hypo->GetTranslationOption();
+ if (transOpt != NULL)
+ {
+ os << hypo->GetCurrSourceWordsRange() << " ";
+ for (size_t decodeStepId = 0; decodeStepId < DecodeStepTranslation::GetNumTransStep(); ++decodeStepId)
+ os << decodeStepId << "=" << transOpt->GetSubRangeCount(decodeStepId) << ",";
+ os << *transOpt << endl;
+ }
+ }
+
+ os << "END TRANSLATION" << endl;
+ */
+}
+
}
diff --git a/moses-cmd/TranslationAnalysis.h b/moses/TranslationAnalysis.h
index 348cfe512..ccb21f041 100644
--- a/moses-cmd/TranslationAnalysis.h
+++ b/moses/TranslationAnalysis.h
@@ -1,14 +1,16 @@
+#pragma once
// $Id$
/*
* also see moses/SentenceStats
*/
-#ifndef moses_cmd_TranslationAnalysis_h
-#define moses_cmd_TranslationAnalysis_h
-
#include <iostream>
-#include "moses/Hypothesis.h"
+
+namespace Moses {
+ class Hypothesis;
+ class ChartHypothesis;
+}
namespace TranslationAnalysis
{
@@ -18,7 +20,7 @@ namespace TranslationAnalysis
* os. Included information: phrase alignment, words dropped, scores
*/
void PrintTranslationAnalysis(std::ostream &os, const Moses::Hypothesis* hypo);
+void PrintTranslationAnalysis(std::ostream &os, const Moses::ChartHypothesis* hypo);
}
-#endif
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp
index 22fef55c6..b0607b770 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.cpp
+++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp
@@ -206,12 +206,9 @@ LoadCorpus(FactorDirection direction,
int sntIdx(0);
// corpus.seekg(0); Seems needless -> commented out to allow
// loading of gzipped corpora (gzfilebuf doesn't support seeking).
- const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
while(getline(corpus, line)) {
sntArray.push_back(sntIdx);
Phrase phrase(ARRAY_SIZE_INCR);
- // parse phrase
- // phrase.CreateFromString( direction, factors, line, factorDelimiter, NULL);
phrase.CreateFromString( direction, factors, line, NULL);
// store words in vocabulary and corpus
for( size_t i = 0; i < phrase.GetSize(); ++i) {
@@ -377,9 +374,9 @@ TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
TargetPhrase*
BilingualDynSuffixArray::
-GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
+GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase, const PhraseDictionary *pt) const
{
- TargetPhrase* targetPhrase = new TargetPhrase();
+ TargetPhrase* targetPhrase = new TargetPhrase(pt);
for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
Word& word = m_trgVocab->GetWord( phrase.words[i]);
UTIL_THROW_IF2(word == m_trgVocab->GetkOOVWord(),
@@ -499,11 +496,9 @@ addSntPair(string& source, string& target, string& alignment)
vuint_t srcFactor, trgFactor;
cerr << "source, target, alignment = " << source << ", "
<< target << ", " << alignment << endl;
- const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR);
- // sphrase.CreateFromString(Input, m_inputFactors, source, factorDelimiter, NULL);
sphrase.CreateFromString(Input, m_inputFactors, source, NULL);
m_srcVocab->MakeOpen();
vector<wordID_t> sIDs(sphrase.GetSize());
@@ -519,7 +514,6 @@ addSntPair(string& source, string& target, string& alignment)
m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
m_srcVocab->MakeClosed();
Phrase tphrase(ARRAY_SIZE_INCR);
- // tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL);
tphrase.CreateFromString(Output, m_outputFactors, target, NULL);
m_trgVocab->MakeOpen();
vector<wordID_t> tIDs(tphrase.GetSize());
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h
index 48d719b7e..5b52b8814 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.h
+++ b/moses/TranslationModel/BilingualDynSuffixArray.h
@@ -128,7 +128,7 @@ public:
GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const;
TargetPhrase*
- GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
+ GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase, const PhraseDictionary *pt) const;
private:
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
index c3672ac47..e2ba6779c 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
@@ -50,6 +50,19 @@ protected:
StackVec m_stackVec;
};
+// struct that caches cellLabel, its end position and score for quicker lookup
+struct ChartCellCache
+{
+ ChartCellCache(size_t endPos, const ChartCellLabel* cellLabel, float score)
+ : endPos(endPos)
+ , cellLabel(cellLabel)
+ , score(score) {}
+
+ size_t endPos;
+ const ChartCellLabel* cellLabel;
+ float score;
+};
+
} // namespace Moses
#endif
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
index f5d6ebf6f..f229a0f4b 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
@@ -22,10 +22,12 @@
#include "moses/ChartParser.h"
#include "moses/InputType.h"
+#include "moses/Terminal.h"
#include "moses/ChartParserCallback.h"
#include "moses/StaticData.h"
#include "moses/NonTerminal.h"
#include "moses/ChartCellCollection.h"
+#include "moses/FactorCollection.h"
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
using namespace std;
@@ -41,6 +43,7 @@ ChartRuleLookupManagerMemory::ChartRuleLookupManagerMemory(
, m_ruleTable(ruleTable)
, m_softMatchingMap(StaticData::Instance().GetSoftMatches())
{
+
size_t sourceSize = parser.GetSize();
m_completedRules.resize(sourceSize);
@@ -58,29 +61,22 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
m_lastPos = lastPos;
m_stackVec.clear();
+ m_stackScores.clear();
m_outColl = &outColl;
m_unaryPos = absEndPos-1; // rules ending in this position are unary and should not be added to collection
+ // create/update data structure to quickly look up all chart cells that match start position and label.
+ UpdateCompressedMatrix(startPos, absEndPos, lastPos);
+
const PhraseDictionaryNodeMemory &rootNode = m_ruleTable.GetRootNode();
- // size-1 terminal rules
+ // all rules starting with terminal
if (startPos == absEndPos) {
- const Word &sourceWord = GetSourceAt(absEndPos).GetLabel();
- const PhraseDictionaryNodeMemory *child = rootNode.GetChild(sourceWord);
-
- // if we found a new rule -> directly add it to the out collection
- if (child != NULL) {
- const TargetPhraseCollection &tpc = child->GetTargetPhraseCollection();
- outColl.Add(tpc, m_stackVec, range);
- }
+ GetTerminalExtension(&rootNode, startPos);
}
// all rules starting with nonterminal
else if (absEndPos > startPos) {
- GetNonTerminalExtension(&rootNode, startPos, absEndPos-1);
- // all (non-unary) rules starting with terminal
- if (absEndPos == startPos+1) {
- GetTerminalExtension(&rootNode, absEndPos-1);
- }
+ GetNonTerminalExtension(&rootNode, startPos);
}
// copy temporarily stored rules to out collection
@@ -93,139 +89,178 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
}
-// if a (partial) rule matches, add it to list completed rules (if non-unary and non-empty), and try find expansions that have this partial rule as prefix.
-void ChartRuleLookupManagerMemory::AddAndExtend(
- const PhraseDictionaryNodeMemory *node,
- size_t endPos,
- const ChartCellLabel *cellLabel)
-{
+// Create/update compressed matrix that stores all valid ChartCellLabels for a given start position and label.
+void ChartRuleLookupManagerMemory::UpdateCompressedMatrix(size_t startPos,
+ size_t origEndPos,
+ size_t lastPos) {
- // add backpointer
- if (cellLabel != NULL) {
- m_stackVec.push_back(cellLabel);
- }
+ std::vector<size_t> endPosVec;
+ size_t numNonTerms = FactorCollection::Instance().GetNumNonTerminals();
+ m_compressedMatrixVec.resize(lastPos+1);
- const TargetPhraseCollection &tpc = node->GetTargetPhraseCollection();
- // add target phrase collection (except if rule is empty or unary)
- if (!tpc.IsEmpty() && endPos != m_unaryPos) {
- m_completedRules[endPos].Add(tpc, m_stackVec, *m_outColl);
- }
+ // we only need to update cell at [startPos, origEndPos-1] for initial lookup
+ if (startPos < origEndPos) {
+ endPosVec.push_back(origEndPos-1);
+ }
- // get all further extensions of rule (until reaching end of sentence or max-chart-span)
- if (endPos < m_lastPos) {
- if (!node->GetTerminalMap().empty()) {
- GetTerminalExtension(node, endPos+1);
+ // update all cells starting from startPos+1 for lookup of rule extensions
+ else if (startPos == origEndPos)
+ {
+ startPos++;
+ for (size_t endPos = startPos; endPos <= lastPos; endPos++) {
+ endPosVec.push_back(endPos);
+ }
+ //re-use data structure for cells with later start position, but remove chart cells that would break max-chart-span
+ for (size_t pos = startPos+1; pos <= lastPos; pos++) {
+ CompressedMatrix & cellMatrix = m_compressedMatrixVec[pos];
+ cellMatrix.resize(numNonTerms);
+ for (size_t i = 0; i < numNonTerms; i++) {
+ if (!cellMatrix[i].empty() && cellMatrix[i].back().endPos > lastPos) {
+ cellMatrix[i].pop_back();
+ }
+ }
+ }
}
- if (!node->GetNonTerminalMap().empty()) {
- for (size_t newEndPos = endPos+1; newEndPos <= m_lastPos; newEndPos++) {
- GetNonTerminalExtension(node, endPos+1, newEndPos);
- }
+
+ if (startPos > lastPos) {
+ return;
}
- }
- // remove backpointer
- if (cellLabel != NULL) {
- m_stackVec.pop_back();
- }
+ // populate compressed matrix with all chart cells that start at current start position
+ CompressedMatrix & cellMatrix = m_compressedMatrixVec[startPos];
+ cellMatrix.clear();
+ cellMatrix.resize(numNonTerms);
+ for (std::vector<size_t>::iterator p = endPosVec.begin(); p != endPosVec.end(); ++p) {
+
+ size_t endPos = *p;
+ // target non-terminal labels for the span
+ const ChartCellLabelSet &targetNonTerms = GetTargetLabelSet(startPos, endPos);
+
+ if (targetNonTerms.GetSize() == 0) {
+ continue;
+ }
+
+#if !defined(UNLABELLED_SOURCE)
+ // source non-terminal labels for the span
+ const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
+
+ // can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
+ if (inputPath.GetNonTerminalSet().size() == 0) {
+ continue;
+ }
+#endif
+
+ for (size_t i = 0; i < numNonTerms; i++) {
+ const ChartCellLabel *cellLabel = targetNonTerms.Find(i);
+ if (cellLabel != NULL) {
+ float score = cellLabel->GetBestScore(m_outColl);
+ cellMatrix[i].push_back(ChartCellCache(endPos, cellLabel, score));
+ }
+ }
+ }
+}
+
+// if a (partial) rule matches, add it to list completed rules (if non-unary and non-empty), and try find expansions that have this partial rule as prefix.
+void ChartRuleLookupManagerMemory::AddAndExtend(
+ const PhraseDictionaryNodeMemory *node,
+ size_t endPos) {
+
+ const TargetPhraseCollection &tpc = node->GetTargetPhraseCollection();
+ // add target phrase collection (except if rule is empty or a unary non-terminal rule)
+ if (!tpc.IsEmpty() && (m_stackVec.empty() || endPos != m_unaryPos)) {
+ m_completedRules[endPos].Add(tpc, m_stackVec, m_stackScores, *m_outColl);
+ }
+
+ // get all further extensions of rule (until reaching end of sentence or max-chart-span)
+ if (endPos < m_lastPos) {
+ if (!node->GetTerminalMap().empty()) {
+ GetTerminalExtension(node, endPos+1);
+ }
+ if (!node->GetNonTerminalMap().empty()) {
+ GetNonTerminalExtension(node, endPos+1);
+ }
+ }
}
+
// search all possible terminal extensions of a partial rule (pointed at by node) at a given position
// recursively try to expand partial rules into full rules up to m_lastPos.
void ChartRuleLookupManagerMemory::GetTerminalExtension(
- const PhraseDictionaryNodeMemory *node,
- size_t pos)
-{
-
- const Word &sourceWord = GetSourceAt(pos).GetLabel();
- const PhraseDictionaryNodeMemory::TerminalMap & terminals = node->GetTerminalMap();
-
- // if node has small number of terminal edges, test word equality for each.
- if (terminals.size() < 5) {
- for (PhraseDictionaryNodeMemory::TerminalMap::const_iterator iter = terminals.begin(); iter != terminals.end(); ++iter) {
- const Word & word = iter->first;
- if (word == sourceWord) {
- const PhraseDictionaryNodeMemory *child = & iter->second;
- AddAndExtend(child, pos, NULL);
+ const PhraseDictionaryNodeMemory *node,
+ size_t pos) {
+
+ const Word &sourceWord = GetSourceAt(pos).GetLabel();
+ const PhraseDictionaryNodeMemory::TerminalMap & terminals = node->GetTerminalMap();
+
+ // if node has small number of terminal edges, test word equality for each.
+ if (terminals.size() < 5) {
+ for (PhraseDictionaryNodeMemory::TerminalMap::const_iterator iter = terminals.begin(); iter != terminals.end(); ++iter) {
+ const Word & word = iter->first;
+ if (TerminalEqualityPred()(word, sourceWord)) {
+ const PhraseDictionaryNodeMemory *child = & iter->second;
+ AddAndExtend(child, pos);
+ break;
+ }
}
}
- }
- // else, do hash lookup
- else {
- const PhraseDictionaryNodeMemory *child = node->GetChild(sourceWord);
- if (child != NULL) {
- AddAndExtend(child, pos, NULL);
+ // else, do hash lookup
+ else {
+ const PhraseDictionaryNodeMemory *child = node->GetChild(sourceWord);
+ if (child != NULL) {
+ AddAndExtend(child, pos);
+ }
}
- }
}
-// search all nonterminal possible nonterminal extensions of a partial rule (pointed at by node) for a given span (StartPos, endPos).
+// search all nonterminal possible nonterminal extensions of a partial rule (pointed at by node) for a variable span (starting from startPos).
// recursively try to expand partial rules into full rules up to m_lastPos.
void ChartRuleLookupManagerMemory::GetNonTerminalExtension(
- const PhraseDictionaryNodeMemory *node,
- size_t startPos,
- size_t endPos)
-{
-
- // target non-terminal labels for the span
- const ChartCellLabelSet &targetNonTerms = GetTargetLabelSet(startPos, endPos);
+ const PhraseDictionaryNodeMemory *node,
+ size_t startPos) {
- if (targetNonTerms.GetSize() == 0) {
- return;
- }
+ const CompressedMatrix &compressedMatrix = m_compressedMatrixVec[startPos];
-#if !defined(UNLABELLED_SOURCE)
- // source non-terminal labels for the span
- const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
- const std::vector<bool> &sourceNonTermArray = inputPath.GetNonTerminalArray();
+ // non-terminal labels in phrase dictionary node
+ const PhraseDictionaryNodeMemory::NonTerminalMap & nonTermMap = node->GetNonTerminalMap();
- // can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
- if (inputPath.GetNonTerminalSet().size() == 0) {
- return;
- }
-#endif
+ // make room for back pointer
+ m_stackVec.push_back(NULL);
+ m_stackScores.push_back(0);
- // non-terminal labels in phrase dictionary node
- const PhraseDictionaryNodeMemory::NonTerminalMap & nonTermMap = node->GetNonTerminalMap();
-
- // loop over possible expansions of the rule
- PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
- PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = nonTermMap.end();
- for (p = nonTermMap.begin(); p != end; ++p) {
- // does it match possible source and target non-terminals?
+ // loop over possible expansions of the rule
+ PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
+ PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = nonTermMap.end();
+ for (p = nonTermMap.begin(); p != end; ++p) {
+ // does it match possible source and target non-terminals?
#if defined(UNLABELLED_SOURCE)
- const Word &targetNonTerm = p->first;
+ const Word &targetNonTerm = p->first;
#else
- const PhraseDictionaryNodeMemory::NonTerminalMapKey &key = p->first;
- const Word &sourceNonTerm = key.first;
- // check if source label matches
- if (! sourceNonTermArray[sourceNonTerm[0]->GetId()]) {
- continue;
- }
- const Word &targetNonTerm = key.second;
+ const Word &targetNonTerm = p->first.second;
#endif
- //soft matching of NTs
- if (m_isSoftMatching && !m_softMatchingMap[targetNonTerm[0]->GetId()].empty()) {
- const std::vector<Word>& softMatches = m_softMatchingMap[targetNonTerm[0]->GetId()];
- for (std::vector<Word>::const_iterator softMatch = softMatches.begin(); softMatch != softMatches.end(); ++softMatch) {
- const ChartCellLabel *cellLabel = targetNonTerms.Find(*softMatch);
- if (cellLabel == NULL) {
- continue;
+ const PhraseDictionaryNodeMemory *child = &p->second;
+ //soft matching of NTs
+ if (m_isSoftMatching && !m_softMatchingMap[targetNonTerm[0]->GetId()].empty()) {
+ const std::vector<Word>& softMatches = m_softMatchingMap[targetNonTerm[0]->GetId()];
+ for (std::vector<Word>::const_iterator softMatch = softMatches.begin(); softMatch != softMatches.end(); ++softMatch) {
+ const CompressedColumn &matches = compressedMatrix[(*softMatch)[0]->GetId()];
+ for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
+ m_stackVec.back() = match->cellLabel;
+ m_stackScores.back() = match->score;
+ AddAndExtend(child, match->endPos);
+ }
}
- // create new rule
- const PhraseDictionaryNodeMemory &child = p->second;
- AddAndExtend(&child, endPos, cellLabel);
- }
- } // end of soft matches lookup
+ } // end of soft matches lookup
- const ChartCellLabel *cellLabel = targetNonTerms.Find(targetNonTerm);
- if (cellLabel == NULL) {
- continue;
+ const CompressedColumn &matches = compressedMatrix[targetNonTerm[0]->GetId()];
+ for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
+ m_stackVec.back() = match->cellLabel;
+ m_stackScores.back() = match->score;
+ AddAndExtend(child, match->endPos);
+ }
}
- // create new rule
- const PhraseDictionaryNodeMemory &child = p->second;
- AddAndExtend(&child, endPos, cellLabel);
- }
+ // remove last back pointer
+ m_stackVec.pop_back();
+ m_stackScores.pop_back();
}
-
} // namespace Moses
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
index 767891e1b..80b6f7246 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
@@ -40,6 +40,10 @@ class WordsRange;
class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus
{
public:
+ typedef std::vector<ChartCellCache> CompressedColumn;
+ typedef std::vector<CompressedColumn> CompressedMatrix;
+
+
ChartRuleLookupManagerMemory(const ChartParser &parser,
const ChartCellCollectionBase &cellColl,
const PhraseDictionaryMemory &ruleTable);
@@ -59,13 +63,15 @@ private:
void GetNonTerminalExtension(
const PhraseDictionaryNodeMemory *node,
- size_t startPos,
- size_t endPos);
+ size_t startPos);
void AddAndExtend(
const PhraseDictionaryNodeMemory *node,
+ size_t endPos);
+
+ void UpdateCompressedMatrix(size_t startPos,
size_t endPos,
- const ChartCellLabel *cellLabel);
+ size_t lastPos);
const PhraseDictionaryMemory &m_ruleTable;
@@ -80,8 +86,13 @@ private:
size_t m_unaryPos;
StackVec m_stackVec;
+ std::vector<float> m_stackScores;
+ std::vector<const Word*> m_sourceWords;
ChartParserCallback* m_outColl;
+ std::vector<CompressedMatrix> m_compressedMatrixVec;
+
+
};
} // namespace Moses
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
index f6e7ee188..23d357f10 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
@@ -22,10 +22,12 @@
#include "moses/ChartParser.h"
#include "moses/InputType.h"
+#include "moses/Terminal.h"
#include "moses/ChartParserCallback.h"
#include "moses/StaticData.h"
#include "moses/NonTerminal.h"
#include "moses/ChartCellCollection.h"
+#include "moses/FactorCollection.h"
#include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
using namespace std;
@@ -59,29 +61,22 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
m_lastPos = lastPos;
m_stackVec.clear();
+ m_stackScores.clear();
m_outColl = &outColl;
m_unaryPos = absEndPos-1; // rules ending in this position are unary and should not be added to collection
+ // create/update data structure to quickly look up all chart cells that match start position and label.
+ UpdateCompressedMatrix(startPos, absEndPos, lastPos);
+
const PhraseDictionaryNodeMemory &rootNode = m_ruleTable.GetRootNode(GetParser().GetTranslationId());
- // size-1 terminal rules
+ // all rules starting with terminal
if (startPos == absEndPos) {
- const Word &sourceWord = GetSourceAt(absEndPos).GetLabel();
- const PhraseDictionaryNodeMemory *child = rootNode.GetChild(sourceWord);
-
- // if we found a new rule -> directly add it to the out collection
- if (child != NULL) {
- const TargetPhraseCollection &tpc = child->GetTargetPhraseCollection();
- outColl.Add(tpc, m_stackVec, range);
- }
+ GetTerminalExtension(&rootNode, startPos);
}
// all rules starting with nonterminal
else if (absEndPos > startPos) {
- GetNonTerminalExtension(&rootNode, startPos, absEndPos-1);
- // all (non-unary) rules starting with terminal
- if (absEndPos == startPos+1) {
- GetTerminalExtension(&rootNode, absEndPos-1);
- }
+ GetNonTerminalExtension(&rootNode, startPos);
}
// copy temporarily stored rules to out collection
@@ -94,140 +89,178 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
}
-// if a (partial) rule matches, add it to list completed rules (if non-unary and non-empty), and try find expansions that have this partial rule as prefix.
-void ChartRuleLookupManagerMemoryPerSentence::AddAndExtend(
- const PhraseDictionaryNodeMemory *node,
- size_t endPos,
- const ChartCellLabel *cellLabel)
-{
+// Create/update compressed matrix that stores all valid ChartCellLabels for a given start position and label.
+void ChartRuleLookupManagerMemoryPerSentence::UpdateCompressedMatrix(size_t startPos,
+ size_t origEndPos,
+ size_t lastPos) {
- // add backpointer
- if (cellLabel != NULL) {
- m_stackVec.push_back(cellLabel);
- }
+ std::vector<size_t> endPosVec;
+ size_t numNonTerms = FactorCollection::Instance().GetNumNonTerminals();
+ m_compressedMatrixVec.resize(lastPos+1);
- const TargetPhraseCollection &tpc = node->GetTargetPhraseCollection();
- // add target phrase collection (except if rule is empty or unary)
- if (!tpc.IsEmpty() && endPos != m_unaryPos) {
- m_completedRules[endPos].Add(tpc, m_stackVec, *m_outColl);
- }
+ // we only need to update cell at [startPos, origEndPos-1] for initial lookup
+ if (startPos < origEndPos) {
+ endPosVec.push_back(origEndPos-1);
+ }
- // get all further extensions of rule (until reaching end of sentence or max-chart-span)
- if (endPos < m_lastPos) {
- if (!node->GetTerminalMap().empty()) {
- GetTerminalExtension(node, endPos+1);
+ // update all cells starting from startPos+1 for lookup of rule extensions
+ else if (startPos == origEndPos)
+ {
+ startPos++;
+ for (size_t endPos = startPos; endPos <= lastPos; endPos++) {
+ endPosVec.push_back(endPos);
+ }
+ //re-use data structure for cells with later start position, but remove chart cells that would break max-chart-span
+ for (size_t pos = startPos+1; pos <= lastPos; pos++) {
+ CompressedMatrix & cellMatrix = m_compressedMatrixVec[pos];
+ cellMatrix.resize(numNonTerms);
+ for (size_t i = 0; i < numNonTerms; i++) {
+ if (!cellMatrix[i].empty() && cellMatrix[i].back().endPos > lastPos) {
+ cellMatrix[i].pop_back();
+ }
+ }
+ }
}
- if (!node->GetNonTerminalMap().empty()) {
- for (size_t newEndPos = endPos+1; newEndPos <= m_lastPos; newEndPos++) {
- GetNonTerminalExtension(node, endPos+1, newEndPos);
- }
+
+ if (startPos > lastPos) {
+ return;
}
- }
- // remove backpointer
- if (cellLabel != NULL) {
- m_stackVec.pop_back();
- }
+ // populate compressed matrix with all chart cells that start at current start position
+ CompressedMatrix & cellMatrix = m_compressedMatrixVec[startPos];
+ cellMatrix.clear();
+ cellMatrix.resize(numNonTerms);
+ for (std::vector<size_t>::iterator p = endPosVec.begin(); p != endPosVec.end(); ++p) {
+
+ size_t endPos = *p;
+ // target non-terminal labels for the span
+ const ChartCellLabelSet &targetNonTerms = GetTargetLabelSet(startPos, endPos);
+
+ if (targetNonTerms.GetSize() == 0) {
+ continue;
+ }
+
+#if !defined(UNLABELLED_SOURCE)
+ // source non-terminal labels for the span
+ const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
+
+ // can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
+ if (inputPath.GetNonTerminalSet().size() == 0) {
+ continue;
+ }
+#endif
+
+ for (size_t i = 0; i < numNonTerms; i++) {
+ const ChartCellLabel *cellLabel = targetNonTerms.Find(i);
+ if (cellLabel != NULL) {
+ float score = cellLabel->GetBestScore(m_outColl);
+ cellMatrix[i].push_back(ChartCellCache(endPos, cellLabel, score));
+ }
+ }
+ }
+}
+
+// if a (partial) rule matches, add it to list completed rules (if non-unary and non-empty), and try find expansions that have this partial rule as prefix.
+void ChartRuleLookupManagerMemoryPerSentence::AddAndExtend(
+ const PhraseDictionaryNodeMemory *node,
+ size_t endPos) {
+
+ const TargetPhraseCollection &tpc = node->GetTargetPhraseCollection();
+ // add target phrase collection (except if rule is empty or a unary non-terminal rule)
+ if (!tpc.IsEmpty() && (m_stackVec.empty() || endPos != m_unaryPos)) {
+ m_completedRules[endPos].Add(tpc, m_stackVec, m_stackScores, *m_outColl);
+ }
+
+ // get all further extensions of rule (until reaching end of sentence or max-chart-span)
+ if (endPos < m_lastPos) {
+ if (!node->GetTerminalMap().empty()) {
+ GetTerminalExtension(node, endPos+1);
+ }
+ if (!node->GetNonTerminalMap().empty()) {
+ GetNonTerminalExtension(node, endPos+1);
+ }
+ }
}
+
// search all possible terminal extensions of a partial rule (pointed at by node) at a given position
// recursively try to expand partial rules into full rules up to m_lastPos.
void ChartRuleLookupManagerMemoryPerSentence::GetTerminalExtension(
- const PhraseDictionaryNodeMemory *node,
- size_t pos)
-{
-
- const Word &sourceWord = GetSourceAt(pos).GetLabel();
- const PhraseDictionaryNodeMemory::TerminalMap & terminals = node->GetTerminalMap();
-
- // if node has small number of terminal edges, test word equality for each.
- if (terminals.size() < 5) {
- for (PhraseDictionaryNodeMemory::TerminalMap::const_iterator iter = terminals.begin(); iter != terminals.end(); ++iter) {
- const Word & word = iter->first;
- if (word == sourceWord) {
- const PhraseDictionaryNodeMemory *child = & iter->second;
- AddAndExtend(child, pos, NULL);
+ const PhraseDictionaryNodeMemory *node,
+ size_t pos) {
+
+ const Word &sourceWord = GetSourceAt(pos).GetLabel();
+ const PhraseDictionaryNodeMemory::TerminalMap & terminals = node->GetTerminalMap();
+
+ // if node has small number of terminal edges, test word equality for each.
+ if (terminals.size() < 5) {
+ for (PhraseDictionaryNodeMemory::TerminalMap::const_iterator iter = terminals.begin(); iter != terminals.end(); ++iter) {
+ const Word & word = iter->first;
+ if (TerminalEqualityPred()(word, sourceWord)) {
+ const PhraseDictionaryNodeMemory *child = & iter->second;
+ AddAndExtend(child, pos);
+ break;
+ }
}
}
- }
- // else, do hash lookup
- else {
- const PhraseDictionaryNodeMemory *child = node->GetChild(sourceWord);
- if (child != NULL) {
- AddAndExtend(child, pos, NULL);
+ // else, do hash lookup
+ else {
+ const PhraseDictionaryNodeMemory *child = node->GetChild(sourceWord);
+ if (child != NULL) {
+ AddAndExtend(child, pos);
+ }
}
- }
}
-// search all nonterminal possible nonterminal extensions of a partial rule (pointed at by node) for a given span (StartPos, endPos).
+// search all nonterminal possible nonterminal extensions of a partial rule (pointed at by node) for a variable span (starting from startPos).
// recursively try to expand partial rules into full rules up to m_lastPos.
void ChartRuleLookupManagerMemoryPerSentence::GetNonTerminalExtension(
- const PhraseDictionaryNodeMemory *node,
- size_t startPos,
- size_t endPos)
-{
-
- // target non-terminal labels for the span
- const ChartCellLabelSet &targetNonTerms = GetTargetLabelSet(startPos, endPos);
-
- if (targetNonTerms.GetSize() == 0) {
- return;
- }
+ const PhraseDictionaryNodeMemory *node,
+ size_t startPos) {
-#if !defined(UNLABELLED_SOURCE)
- // source non-terminal labels for the span
- const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
- const std::vector<bool> &sourceNonTermArray = inputPath.GetNonTerminalArray();
+ const CompressedMatrix &compressedMatrix = m_compressedMatrixVec[startPos];
- // can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
- if (inputPath.GetNonTerminalSet().size() == 0) {
- return;
- }
-#endif
+ // non-terminal labels in phrase dictionary node
+ const PhraseDictionaryNodeMemory::NonTerminalMap & nonTermMap = node->GetNonTerminalMap();
- // non-terminal labels in phrase dictionary node
- const PhraseDictionaryNodeMemory::NonTerminalMap & nonTermMap = node->GetNonTerminalMap();
+ // make room for back pointer
+ m_stackVec.push_back(NULL);
+ m_stackScores.push_back(0);
- // loop over possible expansions of the rule
- PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
- PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = nonTermMap.end();
- for (p = nonTermMap.begin(); p != end; ++p) {
- // does it match possible source and target non-terminals?
+ // loop over possible expansions of the rule
+ PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
+ PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = nonTermMap.end();
+ for (p = nonTermMap.begin(); p != end; ++p) {
+ // does it match possible source and target non-terminals?
#if defined(UNLABELLED_SOURCE)
- const Word &targetNonTerm = p->first;
+ const Word &targetNonTerm = p->first;
#else
- const PhraseDictionaryNodeMemory::NonTerminalMapKey &key = p->first;
- const Word &sourceNonTerm = key.first;
- // check if source label matches
- if (! sourceNonTermArray[sourceNonTerm[0]->GetId()]) {
- continue;
- }
- const Word &targetNonTerm = key.second;
+ const Word &targetNonTerm = p->first.second;
#endif
-
- //soft matching of NTs
- if (m_isSoftMatching && !m_softMatchingMap[targetNonTerm[0]->GetId()].empty()) {
- const std::vector<Word>& softMatches = m_softMatchingMap[targetNonTerm[0]->GetId()];
- for (std::vector<Word>::const_iterator softMatch = softMatches.begin(); softMatch != softMatches.end(); ++softMatch) {
- const ChartCellLabel *cellLabel = targetNonTerms.Find(*softMatch);
- if (cellLabel == NULL) {
- continue;
+ const PhraseDictionaryNodeMemory *child = &p->second;
+ //soft matching of NTs
+ if (m_isSoftMatching && !m_softMatchingMap[targetNonTerm[0]->GetId()].empty()) {
+ const std::vector<Word>& softMatches = m_softMatchingMap[targetNonTerm[0]->GetId()];
+ for (std::vector<Word>::const_iterator softMatch = softMatches.begin(); softMatch != softMatches.end(); ++softMatch) {
+ const CompressedColumn &matches = compressedMatrix[(*softMatch)[0]->GetId()];
+ for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
+ m_stackVec.back() = match->cellLabel;
+ m_stackScores.back() = match->score;
+ AddAndExtend(child, match->endPos);
+ }
}
- // create new rule
- const PhraseDictionaryNodeMemory &child = p->second;
- AddAndExtend(&child, endPos, cellLabel);
- }
- } // end of soft matches lookup
+ } // end of soft matches lookup
- const ChartCellLabel *cellLabel = targetNonTerms.Find(targetNonTerm);
- if (cellLabel == NULL) {
- continue;
+ const CompressedColumn &matches = compressedMatrix[targetNonTerm[0]->GetId()];
+ for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
+ m_stackVec.back() = match->cellLabel;
+ m_stackScores.back() = match->score;
+ AddAndExtend(child, match->endPos);
+ }
}
- // create new rule
- const PhraseDictionaryNodeMemory &child = p->second;
- AddAndExtend(&child, endPos, cellLabel);
- }
+ // remove last back pointer
+ m_stackVec.pop_back();
+ m_stackScores.pop_back();
}
-
} // namespace Moses
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
index c2553199b..d6a6f6535 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
@@ -40,6 +40,9 @@ class WordsRange;
class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYKPlus
{
public:
+ typedef std::vector<ChartCellCache> CompressedColumn;
+ typedef std::vector<CompressedColumn> CompressedMatrix;
+
ChartRuleLookupManagerMemoryPerSentence(const ChartParser &parser,
const ChartCellCollectionBase &cellColl,
const PhraseDictionaryFuzzyMatch &ruleTable);
@@ -59,13 +62,15 @@ private:
void GetNonTerminalExtension(
const PhraseDictionaryNodeMemory *node,
- size_t startPos,
- size_t endPos);
+ size_t startPos);
void AddAndExtend(
const PhraseDictionaryNodeMemory *node,
+ size_t endPos);
+
+ void UpdateCompressedMatrix(size_t startPos,
size_t endPos,
- const ChartCellLabel *cellLabel);
+ size_t lastPos);
const PhraseDictionaryFuzzyMatch &m_ruleTable;
@@ -80,8 +85,12 @@ private:
size_t m_unaryPos;
StackVec m_stackVec;
+ std::vector<float> m_stackScores;
+ std::vector<const Word*> m_sourceWords;
ChartParserCallback* m_outColl;
+ std::vector<CompressedMatrix> m_compressedMatrixVec;
+
};
} // namespace Moses
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp
index 7a6a66dfd..a57e66881 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp
@@ -39,14 +39,12 @@ ChartRuleLookupManagerOnDisk::ChartRuleLookupManagerOnDisk(
const PhraseDictionaryOnDisk &dictionary,
OnDiskPt::OnDiskWrapper &dbWrapper,
const std::vector<FactorType> &inputFactorsVec,
- const std::vector<FactorType> &outputFactorsVec,
- const std::string &filePath)
+ const std::vector<FactorType> &outputFactorsVec)
: ChartRuleLookupManagerCYKPlus(parser, cellColl)
, m_dictionary(dictionary)
, m_dbWrapper(dbWrapper)
, m_inputFactorsVec(inputFactorsVec)
, m_outputFactorsVec(outputFactorsVec)
- , m_filePath(filePath)
{
UTIL_THROW_IF2(m_expandableDottedRuleListVec.size() != 0,
"Dotted rule collection not correctly initialized");
@@ -82,6 +80,8 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
ChartParserCallback &outColl)
{
const StaticData &staticData = StaticData::Instance();
+ const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal();
+
size_t relEndPos = range.GetEndPos() - range.GetStartPos();
size_t absEndPos = range.GetEndPos();
@@ -137,8 +137,6 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
stackInd = relEndPos + 1;
}
- // size_t nonTermNumWordsCovered = endPos - startPos + 1;
-
// get target nonterminals in this span from chart
const ChartCellLabelSet &chartNonTermSet =
GetTargetLabelSet(startPos, endPos);
@@ -174,11 +172,18 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
}
const ChartCellLabel &cellLabel = **iterChartNonTerm;
- //cerr << sourceLHS << " " << defaultSourceNonTerm << " " << chartNonTerm << " " << defaultTargetNonTerm << endl;
+ bool doSearch = true;
+ if (m_dictionary.m_maxSpanDefault != NOT_FOUND) {
+ // for Hieu's source syntax
+
+ bool isSourceSyntaxNonTerm = sourceLHS != defaultSourceNonTerm;
+ size_t nonTermNumWordsCovered = endPos - startPos + 1;
- //bool isSyntaxNonTerm = (sourceLHS != defaultSourceNonTerm) || (chartNonTerm != defaultTargetNonTerm);
- bool doSearch = true; //isSyntaxNonTerm ? nonTermNumWordsCovered <= maxSyntaxSpan :
- // nonTermNumWordsCovered <= maxDefaultSpan;
+ doSearch = isSourceSyntaxNonTerm ?
+ nonTermNumWordsCovered <= m_dictionary.m_maxSpanLabelled :
+ nonTermNumWordsCovered <= m_dictionary.m_maxSpanDefault;
+
+ }
if (doSearch) {
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
index 6213d3b67..6f2f71cdd 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
@@ -41,8 +41,7 @@ public:
const PhraseDictionaryOnDisk &dictionary,
OnDiskPt::OnDiskWrapper &dbWrapper,
const std::vector<FactorType> &inputFactorsVec,
- const std::vector<FactorType> &outputFactorsVec,
- const std::string &filePath);
+ const std::vector<FactorType> &outputFactorsVec);
~ChartRuleLookupManagerOnDisk();
@@ -55,7 +54,6 @@ private:
OnDiskPt::OnDiskWrapper &m_dbWrapper;
const std::vector<FactorType> &m_inputFactorsVec;
const std::vector<FactorType> &m_outputFactorsVec;
- const std::string &m_filePath;
std::vector<DottedRuleStackOnDisk*> m_expandableDottedRuleListVec;
std::map<UINT64, const TargetPhraseCollection*> m_cache;
std::list<const OnDiskPt::PhraseNode*> m_sourcePhraseNode;
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp
index 81cfa8c38..93ddc82db 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp
@@ -77,7 +77,7 @@ TargetPhrase *ChartRuleLookupManagerSkeleton::CreateTargetPhrase(const Word &sou
string str = sourceWord.GetFactor(0)->GetString().as_string();
str = "ChartManagerSkeleton:" + str;
- TargetPhrase *tp = new TargetPhrase();
+ TargetPhrase *tp = new TargetPhrase(&m_skeletonPT);
Word &word = tp->AddWord();
word.CreateFromString(Output, m_skeletonPT.GetOutput(), str, false);
diff --git a/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.cpp b/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.cpp
index d90a3aa5e..332a01499 100644
--- a/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.cpp
+++ b/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.cpp
@@ -77,4 +77,47 @@ void CompletedRuleCollection::Add(const TargetPhraseCollection &tpc,
}
}
+
+// copies some functionality (pruning) from ChartTranslationOptionList::Add
+void CompletedRuleCollection::Add(const TargetPhraseCollection &tpc,
+ const StackVec &stackVec,
+ const std::vector<float> &stackScores,
+ const ChartParserCallback &outColl)
+{
+ if (tpc.IsEmpty()) {
+ return;
+ }
+
+ const TargetPhrase &targetPhrase = **(tpc.begin());
+ float score = std::accumulate(stackScores.begin(), stackScores.end(), targetPhrase.GetFutureScore());
+
+ // If the rule limit has already been reached then don't add the option
+ // unless it is better than at least one existing option.
+ if (m_collection.size() > m_ruleLimit && score < m_scoreThreshold) {
+ return;
+ }
+
+ CompletedRule *completedRule = new CompletedRule(tpc, stackVec, score);
+ m_collection.push_back(completedRule);
+
+ // If the rule limit hasn't been exceeded then update the threshold.
+ if (m_collection.size() <= m_ruleLimit) {
+ m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
+ }
+
+ // Prune if bursting
+ if (m_collection.size() == m_ruleLimit * 2) {
+ NTH_ELEMENT4(m_collection.begin(),
+ m_collection.begin() + m_ruleLimit - 1,
+ m_collection.end(),
+ CompletedRuleOrdered());
+ m_scoreThreshold = m_collection[m_ruleLimit-1]->GetScoreEstimate();
+ for (size_t i = 0 + m_ruleLimit; i < m_collection.size(); i++) {
+ delete m_collection[i];
+
+ }
+ m_collection.resize(m_ruleLimit);
+ }
+}
+
}
diff --git a/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h b/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h
index ffa353ac4..20e8d0e16 100644
--- a/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h
+++ b/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h
@@ -22,6 +22,7 @@
#define moses_CompletedRuleCollectionS_h
#include <vector>
+#include <numeric>
#include "moses/StackVec.h"
#include "moses/TargetPhraseCollection.h"
@@ -103,6 +104,11 @@ public:
const StackVec &stackVec,
const ChartParserCallback &outColl);
+ void Add(const TargetPhraseCollection &tpc,
+ const StackVec &stackVec,
+ const std::vector<float> &stackScores,
+ const ChartParserCallback &outColl);
+
private:
std::vector<CompletedRule*> m_collection;
float m_scoreThreshold;
diff --git a/moses/TranslationModel/CompactPT/MurmurHash3.cpp b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
index d16cd9502..4e4cd4258 100644
--- a/moses/TranslationModel/CompactPT/MurmurHash3.cpp
+++ b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
@@ -29,7 +29,7 @@
#else // defined(_MSC_VER)
-#define FORCE_INLINE __attribute__((always_inline))
+#define FORCE_INLINE inline __attribute__((always_inline))
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
index 0fc12adf2..cfd318d70 100644
--- a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
@@ -418,7 +418,7 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
}
if(eval) {
- targetPhrase->Evaluate(sourcePhrase);
+ targetPhrase->EvaluateInIsolation(sourcePhrase, m_phraseDictionary.GetFeaturesToApply());
}
if(m_coding == PREnc) {
diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
index d6860a43b..a387a5a14 100644
--- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
@@ -57,6 +57,8 @@ void PhraseDictionaryCompact::Load()
{
const StaticData &staticData = StaticData::Instance();
+ SetFeaturesToApply();
+
m_weight = staticData.GetWeights(this);
std::string tFilePath = m_filePath;
diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
index a2bd879b4..3bf0d2820 100644
--- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
@@ -426,7 +426,7 @@ void PhraseTableCreator::AddTargetSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
- = m_sourceSymbolsMap.find(symbol);
+ = m_sourceSymbolsMap.find(symbol);
if(it != m_sourceSymbolsMap.end())
return it->second;
@@ -437,7 +437,7 @@ unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
- = m_targetSymbolsMap.find(symbol);
+ = m_targetSymbolsMap.find(symbol);
if(it != m_targetSymbolsMap.end())
return it->second;
@@ -451,7 +451,7 @@ unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol)
boost::mutex::scoped_lock lock(m_mutex);
#endif
boost::unordered_map<std::string, unsigned>::iterator it
- = m_targetSymbolsMap.find(symbol);
+ = m_targetSymbolsMap.find(symbol);
if(it != m_targetSymbolsMap.end())
return it->second;
@@ -714,10 +714,10 @@ std::string PhraseTableCreator::EncodeLine(std::vector<std::string>& tokens, siz
std::vector<float> scores = Tokenize<float>(scoresStr);
if(scores.size() != m_numScoreComponent) {
- std::stringstream strme;
- strme << "Error: Wrong number of scores detected ("
- << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
- strme << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[3] << " ..." << std::endl;
+ std::stringstream strme;
+ strme << "Error: Wrong number of scores detected ("
+ << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
+ strme << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[2] << " ..." << std::endl;
UTIL_THROW2(strme.str());
}
@@ -1040,30 +1040,30 @@ void RankingTask::operator()()
*it = Moses::Trim(*it);
if(tokens.size() < 4) {
- std::stringstream strme;
- strme << "Error: It seems the following line has a wrong format:" << std::endl;
- strme << "Line " << i << ": " << lines[i] << std::endl;
+ std::stringstream strme;
+ strme << "Error: It seems the following line has a wrong format:" << std::endl;
+ strme << "Line " << i << ": " << lines[i] << std::endl;
UTIL_THROW2(strme.str());
}
if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
- std::stringstream strme;
- strme << "Error: It seems the following line contains no alignment information, " << std::endl;
- strme << "but you are using ";
- strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
- strme << " encoding which makes use of alignment data. " << std::endl;
- strme << "Use -encoding None" << std::endl;
- strme << "Line " << i << ": " << lines[i] << std::endl;
+ std::stringstream strme;
+ strme << "Error: It seems the following line contains no alignment information, " << std::endl;
+ strme << "but you are using ";
+ strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
+ strme << " encoding which makes use of alignment data. " << std::endl;
+ strme << "Use -encoding None" << std::endl;
+ strme << "Line " << i << ": " << lines[i] << std::endl;
UTIL_THROW2(strme.str());
}
std::vector<float> scores = Tokenize<float>(tokens[2]);
if(scores.size() != m_creator.m_numScoreComponent) {
- std::stringstream strme;
- strme << "Error: It seems the following line has a wrong number of scores ("
- << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl;
- strme << "Line " << i << ": " << lines[i] << std::endl;
- UTIL_THROW2(strme.str());
+ std::stringstream strme;
+ strme << "Error: It seems the following line has a wrong number of scores ("
+ << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl;
+ strme << "Line " << i << ": " << lines[i] << std::endl;
+ UTIL_THROW2(strme.str());
}
float sortScore = scores[m_creator.m_sortScoreIndex];
@@ -1140,20 +1140,20 @@ void EncodingTask::operator()()
*it = Moses::Trim(*it);
if(tokens.size() < 3) {
- std::stringstream strme;
- strme << "Error: It seems the following line has a wrong format:" << std::endl;
- strme << "Line " << i << ": " << lines[i] << std::endl;
+ std::stringstream strme;
+ strme << "Error: It seems the following line has a wrong format:" << std::endl;
+ strme << "Line " << i << ": " << lines[i] << std::endl;
UTIL_THROW2(strme.str());
}
- if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
- std::stringstream strme;
- strme << "Error: It seems the following line contains no alignment information, " << std::endl;
- strme << "but you are using ";
- strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
- strme << " encoding which makes use of alignment data. " << std::endl;
- strme << "Use -encoding None" << std::endl;
- strme << "Line " << i << ": " << lines[i] << std::endl;
+ if(tokens.size() > 3 && tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
+ std::stringstream strme;
+ strme << "Error: It seems the following line contains no alignment information, " << std::endl;
+ strme << "but you are using ";
+ strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
+ strme << " encoding which makes use of alignment data. " << std::endl;
+ strme << "Use -encoding None" << std::endl;
+ strme << "Line " << i << ": " << lines[i] << std::endl;
UTIL_THROW2(strme.str());
}
@@ -1218,7 +1218,7 @@ void CompressionTask::operator()()
while(collectionNum < m_encodedCollections.size()) {
std::string collection = m_encodedCollections[collectionNum];
std::string compressedCollection
- = m_creator.CompressEncodedCollection(collection);
+ = m_creator.CompressEncodedCollection(collection);
std::string dummy;
PackedItem packedItem(collectionNum, dummy, compressedCollection, 0);
diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h
index 06f3c24e0..bb2bc11ef 100644
--- a/moses/TranslationModel/CompactPT/StringVector.h
+++ b/moses/TranslationModel/CompactPT/StringVector.h
@@ -86,7 +86,8 @@ protected:
virtual const ValueT* value_ptr(PosT i) const;
public:
- typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
+ //typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
+ typedef ValueIteratorRange<const ValueT *> range;
// ********** RangeIterator **********
@@ -174,8 +175,10 @@ public:
iterator end() const;
PosT length(PosT i) const;
- typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
- typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
+ //typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
+ //typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
+ const ValueT* begin(PosT i) const;
+ const ValueT* end(PosT i) const;
void clear() {
m_charArray->clear();
@@ -469,15 +472,19 @@ const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
-typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::begin(PosT i) const
+//typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::begin(PosT i) const
+const ValueT* StringVector<ValueT, PosT, Allocator>::begin(PosT i) const
{
- return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
+ //return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
+ return value_ptr(i);
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
-typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::end(PosT i) const
+//typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::end(PosT i) const
+const ValueT* StringVector<ValueT, PosT, Allocator>::end(PosT i) const
{
- return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
+ //return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
+ return value_ptr(i) + length(i);
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
diff --git a/moses/TranslationModel/DynSAInclude/FileHandler.cpp b/moses/TranslationModel/DynSAInclude/FileHandler.cpp
index 5f9cd7c45..ffde4a0f3 100644
--- a/moses/TranslationModel/DynSAInclude/FileHandler.cpp
+++ b/moses/TranslationModel/DynSAInclude/FileHandler.cpp
@@ -71,13 +71,13 @@ bool FileHandler::setStreamBuffer(bool checkExists)
{
// redirect stdin or stdout if necesary
if (path_ == FileHandler::kStdInDescriptor) {
- UTIL_THROW_IF2(flags_ & std::ios::in == 0,
- "Incorrect flags: " << flags_);
+ UTIL_THROW_IF2((flags_ & std::ios::in) == 0,
+ "Incorrect flags: " << flags_);
std::streambuf* sb = std::cin.rdbuf();
buffer_ = sb;
} else if (path_ == FileHandler::kStdOutDescriptor) {
- UTIL_THROW_IF2(flags_ & std::ios::out == 0,
- "Incorrect flags: " << flags_);
+ UTIL_THROW_IF2((flags_ & std::ios::out) == 0,
+ "Incorrect flags: " << flags_);
std::streambuf* sb = std::cout.rdbuf();
buffer_ = sb;
} else {
diff --git a/moses/TranslationModel/DynSAInclude/params.cpp b/moses/TranslationModel/DynSAInclude/params.cpp
index 4696258f8..03ad48446 100644
--- a/moses/TranslationModel/DynSAInclude/params.cpp
+++ b/moses/TranslationModel/DynSAInclude/params.cpp
@@ -123,11 +123,12 @@ bool Parameters::setParamValue(const std::string& name, const std::string& val)
std::string Parameters::getParamValue(const std::string& name)
{
std::string value = Parameters::kNotSetValue;
- if(isValidParamName(name))
+ if(isValidParamName(name)) {
if(params_.find(name) != params_.end())
value = params_[name].value;
else if(getValueType(name) == kBoolValue)
value = kFalseValue;
+ }
return value;
}
std::string Parameters::getParam(const std::string& name)
diff --git a/moses/TranslationModel/PhraseDictionary.cpp b/moses/TranslationModel/PhraseDictionary.cpp
index 026729be9..e0251b907 100644
--- a/moses/TranslationModel/PhraseDictionary.cpp
+++ b/moses/TranslationModel/PhraseDictionary.cpp
@@ -24,6 +24,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/InputType.h"
#include "moses/TranslationOption.h"
#include "moses/UserMessage.h"
+#include "moses/DecodeStep.h"
#include "moses/DecodeGraph.h"
#include "moses/InputPath.h"
#include "util/exception.hh"
@@ -36,11 +37,11 @@ std::vector<PhraseDictionary*> PhraseDictionary::s_staticColl;
CacheColl::~CacheColl()
{
- for (iterator iter = begin(); iter != end(); ++iter) {
- std::pair<const TargetPhraseCollection*, clock_t> &key = iter->second;
- const TargetPhraseCollection *tps = key.first;
- delete tps;
- }
+ for (iterator iter = begin(); iter != end(); ++iter) {
+ std::pair<const TargetPhraseCollection*, clock_t> &key = iter->second;
+ const TargetPhraseCollection *tps = key.first;
+ delete tps;
+ }
}
PhraseDictionary::PhraseDictionary(const std::string &line)
@@ -48,7 +49,8 @@ PhraseDictionary::PhraseDictionary(const std::string &line)
,m_tableLimit(20) // default
,m_maxCacheSize(DEFAULT_MAX_TRANS_OPT_CACHE_SIZE)
{
- s_staticColl.push_back(this);
+ m_id = s_staticColl.size();
+ s_staticColl.push_back(this);
}
bool
@@ -137,22 +139,22 @@ SetFeaturesToApply()
}
}
+
+ // tell the Phrase Dictionary that the TargetPhraseCollection is not needed any more
+ void
+ PhraseDictionary::
+ Release(TargetPhraseCollection const* tpc) const
+ {
+ // do nothing by default
+ return;
+ }
-// tell the Phrase Dictionary that the TargetPhraseCollection is not needed any more
-void
-PhraseDictionary::
-Release(TargetPhraseCollection const* tpc) const
-{
- // do nothing by default
- return;
-}
-
-bool
-PhraseDictionary::
-PrefixExists(Phrase const& phrase) const
-{
- return true;
-}
+ bool
+ PhraseDictionary::
+ PrefixExists(Phrase const& phrase) const
+ {
+ return true;
+ }
void
PhraseDictionary::
@@ -164,7 +166,7 @@ GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
// backoff
if (!SatisfyBackoff(inputPath)) {
- continue;
+ continue;
}
const Phrase &phrase = inputPath.GetPhrase();
@@ -173,6 +175,32 @@ GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
}
}
+// persistent cache handling
+// saving presistent cache to disk
+//void PhraseDictionary::SaveCache() const
+//{
+// CacheColl &cache = GetCache();
+// for( std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iter,
+// iter != cache.end(),
+// iter++ ) {
+//
+// }
+//}
+
+// loading persistent cache from disk
+//void PhraseDictionary::LoadCache() const
+//{
+// CacheColl &cache = GetCache();
+// std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iter;
+// iter = cache.begin();
+// while( iter != cache.end() ) {
+// std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iterRemove = iter++;
+// delete iterRemove->second.first;
+// cache.erase(iterRemove);
+// }
+//}
+
+// reduce presistent cache by half of maximum size
void PhraseDictionary::ReduceCache() const
{
Timer reduceCacheTime;
@@ -225,25 +253,25 @@ bool PhraseDictionary::SatisfyBackoff(const InputPath &inputPath) const
size_t backoff = decodeGraph.GetBackoff();
if (backoff == 0) {
- // ie. don't backoff. Collect ALL translations
- return true;
+ // ie. don't backoff. Collect ALL translations
+ return true;
}
if (sourcePhrase.GetSize() > backoff) {
- // source phrase too big
- return false;
+ // source phrase too big
+ return false;
}
// lookup translation only if no other translations
InputPath::TargetPhrases::const_iterator iter;
for (iter = inputPath.GetTargetPhrases().begin(); iter != inputPath.GetTargetPhrases().end(); ++iter) {
- const std::pair<const TargetPhraseCollection*, const void*> &temp = iter->second;
- const TargetPhraseCollection *tpCollPrev = temp.first;
+ const std::pair<const TargetPhraseCollection*, const void*> &temp = iter->second;
+ const TargetPhraseCollection *tpCollPrev = temp.first;
- if (tpCollPrev && tpCollPrev->GetSize()) {
- // already have translation from another pt. Don't create translations
- return false;
- }
+ if (tpCollPrev && tpCollPrev->GetSize()) {
+ // already have translation from another pt. Don't create translations
+ return false;
+ }
}
return true;
diff --git a/moses/TranslationModel/PhraseDictionary.h b/moses/TranslationModel/PhraseDictionary.h
index e7fa3411f..f1b938db0 100644
--- a/moses/TranslationModel/PhraseDictionary.h
+++ b/moses/TranslationModel/PhraseDictionary.h
@@ -87,6 +87,10 @@ public:
return m_tableLimit;
}
+ //! continguous id for each pt, starting from 0
+ size_t GetId() const
+ { return m_id; }
+
virtual
void
Release(TargetPhraseCollection const* tpc) const;
@@ -167,6 +171,7 @@ protected:
protected:
CacheColl &GetCache() const;
+ size_t m_id;
};
diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
index 3d2b0af08..83b78fe5b 100644
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
@@ -58,9 +58,9 @@ GetTargetPhraseCollectionLEGACY(const Phrase& src) const
TargetPhraseCollection *ret = new TargetPhraseCollection();
BOOST_FOREACH(pstat_entry & e, pstats) {
- TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src);
+ TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src, this);
tp->GetScoreBreakdown().Assign(this,e.second);
- tp->Evaluate(src);
+ tp->EvaluateInIsolation(src);
ret->Add(tp);
}
// return ret;
diff --git a/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp b/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp
index d584429bf..e9c656937 100644
--- a/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp
+++ b/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp
@@ -163,7 +163,7 @@ const TargetPhraseCollection *PhraseDictionaryDynamicCacheBased::GetTargetPhrase
std::vector<const TargetPhrase*>::const_iterator it2 = tpc->begin();
while (it2 != tpc->end()) {
- ((TargetPhrase*) *it2)->Evaluate(source, GetFeaturesToApply());
+ ((TargetPhrase*) *it2)->EvaluateInIsolation(source, GetFeaturesToApply());
it2++;
}
}
@@ -515,7 +515,7 @@ void PhraseDictionaryDynamicCacheBased::Update(std::string sourcePhraseString, s
const StaticData &staticData = StaticData::Instance();
const std::string& factorDelimiter = staticData.GetFactorDelimiter();
Phrase sourcePhrase(0);
- Phrase targetPhrase(0);
+ TargetPhrase targetPhrase(0);
VERBOSE(3, "ageString:|" << ageString << "|" << std::endl);
char *err_ind_temp;
@@ -541,9 +541,9 @@ void PhraseDictionaryDynamicCacheBased::Update(std::string sourcePhraseString, s
Update(sourcePhrase, targetPhrase, age, waString);
}
-void PhraseDictionaryDynamicCacheBased::Update(Phrase sp, Phrase tp, int age, std::string waString)
+void PhraseDictionaryDynamicCacheBased::Update(Phrase sp, TargetPhrase tp, int age, std::string waString)
{
- VERBOSE(3,"PhraseDictionaryDynamicCacheBased::Update(Phrase sp, Phrase tp, int age, std::string waString)" << std::endl);
+ VERBOSE(3,"PhraseDictionaryDynamicCacheBased::Update(Phrase sp, TargetPhrase tp, int age, std::string waString)" << std::endl);
#ifdef WITH_THREADS
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
#endif
@@ -560,14 +560,15 @@ void PhraseDictionaryDynamicCacheBased::Update(Phrase sp, Phrase tp, int age, st
TargetCollectionAgePair TgtCollAgePair = it->second;
TargetPhraseCollection* tpc = TgtCollAgePair.first;
AgeCollection* ac = TgtCollAgePair.second;
+// const TargetPhrase* p_ptr = NULL;
const Phrase* p_ptr = NULL;
TargetPhrase* tp_ptr = NULL;
bool found = false;
size_t tp_pos=0;
while (!found && tp_pos < tpc->GetSize()) {
tp_ptr = (TargetPhrase*) tpc->GetTargetPhrase(tp_pos);
- p_ptr = (const Phrase*) tp_ptr;
- if (tp == *p_ptr) {
+ p_ptr = (const TargetPhrase*) tp_ptr;
+ if ((Phrase) tp == *p_ptr) {
found = true;
continue;
}
diff --git a/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h b/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h
index 909350e05..4a89f8444 100644
--- a/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h
+++ b/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h
@@ -147,7 +147,7 @@ protected:
void Decay(Phrase p); // traverse through the cache and decay each entry for a given Phrase
void Update(std::vector<std::string> entries, std::string ageString);
void Update(std::string sourceString, std::string targetString, std::string ageString, std::string waString="");
- void Update(Phrase p, Phrase tp, int age, std::string waString="");
+ void Update(Phrase p, TargetPhrase tp, int age, std::string waString="");
void ClearEntries(std::vector<std::string> entries);
void ClearEntries(std::string sourceString, std::string targetString);
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
index 591c6ca77..f226b8ba4 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
@@ -30,26 +30,37 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
{
ReadParameters();
- if (m_mode != "interpolate") {
+ if (m_mode == "interpolate") {
+ size_t numWeights = m_numScoreComponents;
+ UTIL_THROW_IF2(m_pdStr.size() != m_multimodelweights.size() &&
+ m_pdStr.size()*numWeights != m_multimodelweights.size(),
+ "Number of scores and weights are not equal");
+ } else if (m_mode == "all" || m_mode == "all-restrict") {
+ size_t componentWeights = 0;
+ for(size_t i = 0; i < m_numModels; ++i) {
+ const string &ptName = m_pdStr[i];
+ PhraseDictionary *pt = FindPhraseDictionary(ptName);
+ UTIL_THROW_IF2(pt == NULL,
+ "Could not find component phrase table " << ptName);
+ componentWeights += pt->GetNumScoreComponents();
+ }
+ UTIL_THROW_IF2(componentWeights != m_numScoreComponents,
+ "Total number of component model scores is unequal to specified number of scores");
+ } else {
ostringstream msg;
msg << "combination mode unknown: " << m_mode;
throw runtime_error(msg.str());
}
-
- size_t numWeights = m_numScoreComponents;
- UTIL_THROW_IF2(m_pdStr.size() != m_multimodelweights.size() &
- m_pdStr.size()*numWeights != m_multimodelweights.size(),
- "Number of scores and weights are not equal");
}
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(int type, const std::string &line)
:PhraseDictionary(line)
{
if (type == 1) {
- // PhraseDictionaryMultiModelCounts
+ // PhraseDictionaryMultiModelCounts
UTIL_THROW_IF2(m_pdStr.size() != m_multimodelweights.size() &&
- m_pdStr.size()*4 != m_multimodelweights.size(),
- "Number of scores and weights are not equal");
+ m_pdStr.size()*4 != m_multimodelweights.size(),
+ "Number of scores and weights are not equal");
}
}
@@ -80,7 +91,7 @@ void PhraseDictionaryMultiModel::Load()
PhraseDictionary *pt = FindPhraseDictionary(ptName);
UTIL_THROW_IF2(pt == NULL,
- "Could not find component phrase table " << ptName);
+ "Could not find component phrase table " << ptName);
m_pd.push_back(pt);
}
}
@@ -95,20 +106,23 @@ const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollect
multimodelweights = getWeights(m_numScoreComponents, true);
}
- std::map<std::string,multiModelStatistics*>* allStats = new(std::map<std::string,multiModelStatistics*>);
-
- CollectSufficientStatistics(src, allStats);
-
TargetPhraseCollection *ret = NULL;
+
if (m_mode == "interpolate") {
+ std::map<std::string,multiModelStatistics*>* allStats = new(std::map<std::string,multiModelStatistics*>);
+ CollectSufficientStatistics(src, allStats);
ret = CreateTargetPhraseCollectionLinearInterpolation(src, allStats, multimodelweights);
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ } else if (m_mode == "all") {
+ ret = CreateTargetPhraseCollectionAll(src, false);
+ } else if (m_mode == "all-restrict") {
+ ret = CreateTargetPhraseCollectionAll(src, true);
}
ret->NthElement(m_tableLimit); // sort the phrases for pruning later
const_cast<PhraseDictionaryMultiModel*>(this)->CacheForCleanup(ret);
- RemoveAllInMap(*allStats);
- delete allStats;
-
+
return ret;
}
@@ -147,7 +161,7 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
vector<FeatureFunction*> pd_feature;
pd_feature.push_back(m_pd[i]);
const vector<FeatureFunction*> pd_feature_const(pd_feature);
- statistics->targetPhrase->Evaluate(src, pd_feature_const);
+ statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
// zero out scores from original phrase table
statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);
@@ -166,7 +180,6 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
}
}
-
TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const
{
TargetPhraseCollection *ret = new TargetPhraseCollection();
@@ -186,13 +199,95 @@ TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollection
vector<FeatureFunction*> pd_feature;
pd_feature.push_back(const_cast<PhraseDictionaryMultiModel*>(this));
const vector<FeatureFunction*> pd_feature_const(pd_feature);
- statistics->targetPhrase->Evaluate(src, pd_feature_const);
+ statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
ret->Add(new TargetPhrase(*statistics->targetPhrase));
}
return ret;
}
+TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionAll(const Phrase& src, const bool restricted) const
+{
+ // Collect phrases from all models
+ std::map<std::string, multiModelPhrase*> allPhrases;
+ size_t offset = 0;
+ for(size_t i = 0; i < m_numModels; ++i) {
+ const PhraseDictionary &pd = *m_pd[i];
+
+ TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollectionLEGACY(src);
+ if (ret_raw != NULL) {
+
+ TargetPhraseCollection::iterator iterTargetPhrase, iterLast;
+ if (m_tableLimit != 0 && ret_raw->GetSize() > m_tableLimit) {
+ iterLast = ret_raw->begin() + m_tableLimit;
+ } else {
+ iterLast = ret_raw->end();
+ }
+
+ for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != iterLast; ++iterTargetPhrase) {
+ const TargetPhrase* targetPhrase = *iterTargetPhrase;
+ std::vector<float> raw_scores = targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd);
+
+ std::string targetString = targetPhrase->GetStringRep(m_output);
+ // Phrase not in collection -> add if unrestricted (all) or first model (all-restrict)
+ if (allPhrases.find(targetString) == allPhrases.end()) {
+ // all-restrict and not first model: skip adding unseen phrase
+ if (restricted && i > 0) {
+ continue;
+ }
+
+ multiModelPhrase* phrase = new multiModelPhrase;
+ phrase->targetPhrase = new TargetPhrase(*targetPhrase); //make a copy so that we don't overwrite the original phrase table info
+ // p contains scores from all models in order. Values default to zero for models that do not contain phrase.
+ phrase->p.resize(m_numScoreComponents, 0);
+
+ //correct future cost estimates and total score
+ phrase->targetPhrase->GetScoreBreakdown().InvertDenseFeatures(&pd);
+ vector<FeatureFunction*> pd_feature;
+ pd_feature.push_back(m_pd[i]);
+ const vector<FeatureFunction*> pd_feature_const(pd_feature);
+ phrase->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
+ // zero out scores from original phrase table
+ phrase->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);
+
+ allPhrases[targetString] = phrase;
+
+ }
+ multiModelPhrase* phrase = allPhrases[targetString];
+
+ for(size_t j = 0; j < pd.GetNumScoreComponents(); ++j) {
+ phrase->p[offset + j] = raw_scores[j];
+ }
+ }
+ }
+ offset += pd.GetNumScoreComponents();
+ }
+
+ // Copy accumulated score vectors to phrases
+ TargetPhraseCollection* ret = new TargetPhraseCollection();
+ for (std::map<std::string, multiModelPhrase*>::const_iterator iter = allPhrases.begin(); iter != allPhrases.end(); ++iter) {
+
+ multiModelPhrase* phrase = iter->second;
+ Scores scoreVector(m_numScoreComponents);
+
+ for(size_t i = 0; i < m_numScoreComponents; ++i) {
+ scoreVector[i] = phrase->p[i];
+ }
+
+ phrase->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+
+ //correct future cost estimates and total score
+ vector<FeatureFunction*> pd_feature;
+ pd_feature.push_back(const_cast<PhraseDictionaryMultiModel*>(this));
+ const vector<FeatureFunction*> pd_feature_const(pd_feature);
+ phrase->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
+
+ ret->Add(new TargetPhrase(*phrase->targetPhrase));
+ }
+
+ RemoveAllInMap(allPhrases);
+ return ret;
+}
//TODO: is it worth caching the results as long as weights don't change?
std::vector<std::vector<float> > PhraseDictionaryMultiModel::getWeights(size_t numWeights, bool normalize) const
@@ -323,9 +418,6 @@ void PhraseDictionaryMultiModel::SetTemporaryMultiModelWeightsVector(std::vector
vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
{
- const StaticData &staticData = StaticData::Instance();
- const string& factorDelimiter = staticData.GetFactorDelimiter();
-
map<pair<string, string>, size_t> phrase_pair_map;
for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
@@ -344,7 +436,7 @@ vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string,
map<string,multiModelStatistics*>* allStats = new(map<string,multiModelStatistics*>);
Phrase sourcePhrase(0);
- sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
+ sourcePhrase.CreateFromString(Input, m_input, source_string, NULL);
CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.h b/moses/TranslationModel/PhraseDictionaryMultiModel.h
index 5886a9d98..41df2e679 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModel.h
+++ b/moses/TranslationModel/PhraseDictionaryMultiModel.h
@@ -51,6 +51,14 @@ struct multiModelStatisticsOptimization: multiModelStatistics {
class OptimizationObjective;
+struct multiModelPhrase {
+ TargetPhrase *targetPhrase;
+ std::vector<float> p;
+ ~multiModelPhrase() {
+ delete targetPhrase;
+ };
+};
+
/** Implementation of a virtual phrase table constructed from multiple component phrase tables.
*/
class PhraseDictionaryMultiModel: public PhraseDictionary
@@ -66,6 +74,7 @@ public:
void Load();
virtual void CollectSufficientStatistics(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats) const;
virtual TargetPhraseCollection* CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const;
+ virtual TargetPhraseCollection* CreateTargetPhraseCollectionAll(const Phrase& src, const bool restricted = false) const;
std::vector<std::vector<float> > getWeights(size_t numWeights, bool normalize) const;
std::vector<float> normalizeWeights(std::vector<float> &weights) const;
void CacheForCleanup(TargetPhraseCollection* tpc);
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
index 60d3410ea..c632f9ff2 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
@@ -17,12 +17,8 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "util/exception.hh"
-
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
-#define LINE_MAX_LENGTH 100000
-#include "phrase-extract/SafeGetline.h" // for SAFE_GETLINE()
-
using namespace std;
template<typename T>
@@ -35,7 +31,7 @@ void OutputVec(const vector<T> &vec)
}
// from phrase-extract/tables-core.cpp
-vector<string> tokenize( const char* input )
+inline vector<string> tokenize( const char* input )
{
vector< string > token;
bool betweenWords = true;
@@ -193,7 +189,7 @@ void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase&
vector<FeatureFunction*> pd_feature;
pd_feature.push_back(m_pd[i]);
const vector<FeatureFunction*> pd_feature_const(pd_feature);
- statistics->targetPhrase->Evaluate(src, pd_feature_const);
+ statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
// zero out scores from original phrase table
statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);
@@ -255,7 +251,7 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl
vector<FeatureFunction*> pd_feature;
pd_feature.push_back(const_cast<PhraseDictionaryMultiModelCounts*>(this));
const vector<FeatureFunction*> pd_feature_const(pd_feature);
- statistics->targetPhrase->Evaluate(src, pd_feature_const);
+ statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
} catch (AlignmentException& e) {
continue;
}
@@ -461,16 +457,14 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
}
istream *inFileP = &inFile;
- char line[LINE_MAX_LENGTH];
-
int i=0;
- while(true) {
+ string line;
+
+ while(getline(*inFileP, line)) {
i++;
if (i%100000 == 0) cerr << "." << flush;
- SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (inFileP->eof()) break;
- vector<string> token = tokenize( line );
+ vector<string> token = tokenize( line.c_str() );
if (token.size() != 4) {
cerr << "line " << i << " in " << fileName
<< " has wrong number of tokens, skipping:\n"
@@ -495,9 +489,6 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
{
- const StaticData &staticData = StaticData::Instance();
- const string& factorDelimiter = staticData.GetFactorDelimiter();
-
map<pair<string, string>, size_t> phrase_pair_map;
for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
@@ -516,7 +507,7 @@ vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<s
map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);
Phrase sourcePhrase(0);
- sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
+ sourcePhrase.CreateFromString(Input, m_input, source_string, NULL);
CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
index 722035d1e..c948b66b2 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
@@ -30,8 +30,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/UserMessage.h"
#include <exception>
-extern std::vector<std::string> tokenize( const char*);
-
namespace Moses
{
diff --git a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
index 4475ac1aa..c2ffd95da 100644
--- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
@@ -14,20 +14,20 @@ PhraseDictionaryTransliteration::PhraseDictionaryTransliteration(const std::stri
{
ReadParameters();
UTIL_THROW_IF2(m_mosesDir.empty() ||
- m_scriptDir.empty() ||
- m_externalDir.empty() ||
- m_inputLang.empty() ||
- m_outputLang.empty(), "Must specify all arguments");
+ m_scriptDir.empty() ||
+ m_externalDir.empty() ||
+ m_inputLang.empty() ||
+ m_outputLang.empty(), "Must specify all arguments");
}
void PhraseDictionaryTransliteration::Load()
{
- SetFeaturesToApply();
+ SetFeaturesToApply();
}
void PhraseDictionaryTransliteration::CleanUpAfterSentenceProcessing(const InputType& source)
{
- ReduceCache();
+ ReduceCache();
}
void PhraseDictionaryTransliteration::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
@@ -38,14 +38,14 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollectionBatch(const Input
InputPath &inputPath = **iter;
if (!SatisfyBackoff(inputPath)) {
- continue;
+ continue;
}
const Phrase &sourcePhrase = inputPath.GetPhrase();
if (sourcePhrase.GetSize() != 1) {
- // only translit single words. A limitation of the translit script
- continue;
+ // only translit single words. A limitation of the translit script
+ continue;
}
GetTargetPhraseCollection(inputPath);
@@ -54,89 +54,90 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollectionBatch(const Input
void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &inputPath) const
{
- const Phrase &sourcePhrase = inputPath.GetPhrase();
- size_t hash = hash_value(sourcePhrase);
+ const Phrase &sourcePhrase = inputPath.GetPhrase();
+ size_t hash = hash_value(sourcePhrase);
- CacheColl &cache = GetCache();
+ CacheColl &cache = GetCache();
- CacheColl::iterator iter;
- iter = cache.find(hash);
+ CacheColl::iterator iter;
+ iter = cache.find(hash);
- if (iter != cache.end()) {
- // already in cache
- const TargetPhraseCollection *tpColl = iter->second.first;
- inputPath.SetTargetPhrases(*this, tpColl, NULL);
- } else {
- // TRANSLITERATE
- char *ptr = tmpnam(NULL);
- string inFile(ptr);
- ptr = tmpnam(NULL);
- string outDir(ptr);
-
- ofstream inStream(inFile.c_str());
- inStream << sourcePhrase.ToString() << endl;
- inStream.close();
-
- string cmd = m_scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" +
- " --transliteration-model-dir " + m_filePath +
- " --moses-src-dir " + m_mosesDir +
- " --external-bin-dir " + m_externalDir +
- " --input-extension " + m_inputLang +
- " --output-extension " + m_outputLang +
- " --oov-file " + inFile +
- " --out-dir " + outDir;
-
- int ret = system(cmd.c_str());
- UTIL_THROW_IF2(ret != 0, "Transliteration script error");
-
- TargetPhraseCollection *tpColl = new TargetPhraseCollection();
- vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
- vector<TargetPhrase*>::const_iterator iter;
- for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
- TargetPhrase *tp = *iter;
- tpColl->Add(tp);
+ if (iter != cache.end()) {
+ // already in cache
+ const TargetPhraseCollection *tpColl = iter->second.first;
+ inputPath.SetTargetPhrases(*this, tpColl, NULL);
+ }
+ else {
+ // TRANSLITERATE
+ char *ptr = tmpnam(NULL);
+ string inFile(ptr);
+ ptr = tmpnam(NULL);
+ string outDir(ptr);
+
+ ofstream inStream(inFile.c_str());
+ inStream << sourcePhrase.ToString() << endl;
+ inStream.close();
+
+ string cmd = m_scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" +
+ " --transliteration-model-dir " + m_filePath +
+ " --moses-src-dir " + m_mosesDir +
+ " --external-bin-dir " + m_externalDir +
+ " --input-extension " + m_inputLang +
+ " --output-extension " + m_outputLang +
+ " --oov-file " + inFile +
+ " --out-dir " + outDir;
+
+ int ret = system(cmd.c_str());
+ UTIL_THROW_IF2(ret != 0, "Transliteration script error");
+
+ TargetPhraseCollection *tpColl = new TargetPhraseCollection();
+ vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
+ vector<TargetPhrase*>::const_iterator iter;
+ for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
+ TargetPhrase *tp = *iter;
+ tpColl->Add(tp);
+ }
+
+ std::pair<const TargetPhraseCollection*, clock_t> value(tpColl, clock());
+ cache[hash] = value;
+
+ inputPath.SetTargetPhrases(*this, tpColl, NULL);
+
+ // clean up temporary files
+ remove(inFile.c_str());
+
+ cmd = "rm -rf " + outDir;
+ system(cmd.c_str());
}
-
- std::pair<const TargetPhraseCollection*, clock_t> value(tpColl, clock());
- cache[hash] = value;
-
- inputPath.SetTargetPhrases(*this, tpColl, NULL);
-
- // clean up temporary files
- remove(inFile.c_str());
-
- cmd = "rm -rf " + outDir;
- system(cmd.c_str());
- }
}
std::vector<TargetPhrase*> PhraseDictionaryTransliteration::CreateTargetPhrases(const Phrase &sourcePhrase, const string &outDir) const
{
- std::vector<TargetPhrase*> ret;
+ std::vector<TargetPhrase*> ret;
- string outPath = outDir + "/out.txt";
- ifstream outStream(outPath.c_str());
+ string outPath = outDir + "/out.txt";
+ ifstream outStream(outPath.c_str());
- string line;
- while (getline(outStream, line)) {
- vector<string> toks;
- Tokenize(toks, line, "\t");
- UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore");
+ string line;
+ while (getline(outStream, line)) {
+ vector<string> toks;
+ Tokenize(toks, line, "\t");
+ UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore");
- TargetPhrase *tp = new TargetPhrase();
- Word &word = tp->AddWord();
- word.CreateFromString(Output, m_output, toks[0], false);
+ TargetPhrase *tp = new TargetPhrase(this);
+ Word &word = tp->AddWord();
+ word.CreateFromString(Output, m_output, toks[0], false);
- float score = Scan<float>(toks[1]);
- tp->GetScoreBreakdown().PlusEquals(this, score);
+ float score = Scan<float>(toks[1]);
+ tp->GetScoreBreakdown().PlusEquals(this, score);
- // score of all other ff when this rule is being loaded
- tp->Evaluate(sourcePhrase, GetFeaturesToApply());
+ // score of all other ff when this rule is being loaded
+ tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
- ret.push_back(tp);
- }
+ ret.push_back(tp);
+ }
- outStream.close();
+ outStream.close();
return ret;
}
@@ -145,7 +146,7 @@ ChartRuleLookupManager* PhraseDictionaryTransliteration::CreateRuleLookupManager
const ChartCellCollectionBase &cellCollection,
std::size_t /*maxChartSpan*/)
{
- return NULL;
+ return NULL;
//return new ChartRuleLookupManagerSkeleton(parser, cellCollection, *this);
}
@@ -154,17 +155,17 @@ PhraseDictionaryTransliteration::
SetParameter(const std::string& key, const std::string& value)
{
if (key == "moses-dir") {
- m_mosesDir = value;
+ m_mosesDir = value;
} else if (key == "script-dir") {
- m_scriptDir = value;
+ m_scriptDir = value;
} else if (key == "external-dir") {
- m_externalDir = value;
+ m_externalDir = value;
} else if (key == "input-lang") {
- m_inputLang = value;
+ m_inputLang = value;
} else if (key == "output-lang") {
- m_outputLang = value;
+ m_outputLang = value;
} else {
- PhraseDictionary::SetParameter(key, value);
+ PhraseDictionary::SetParameter(key, value);
}
}
diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp
index eec8f0403..c8b7cb5d2 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTree.cpp
@@ -3,6 +3,7 @@
#include "moses/FeatureVector.h"
#include "moses/TranslationModel/PhraseDictionaryTree.h"
#include "util/exception.hh"
+#include "moses/StaticData.h"
#include <map>
#include <sstream>
@@ -233,7 +234,8 @@ public:
typedef PhraseDictionaryTree::PrefixPtr PPtr;
void GetTargetCandidates(PPtr p,TgtCands& tgtCands) {
- UTIL_THROW_IF2(p == NULL, "Error");
+ UTIL_THROW_IF2(p == 0L, "Error");
+ // UTIL_THROW_IF2(p == NULL, "Error");
if(p.imp->isRoot()) return;
OFF_T tCandOffset=p.imp->ptr()->getData(p.imp->idx);
@@ -278,7 +280,8 @@ public:
}
PPtr Extend(PPtr p,const std::string& w) {
- UTIL_THROW_IF2(p == NULL, "Error");
+ UTIL_THROW_IF2(p == 0L, "Error");
+ // UTIL_THROW_IF2(p == NULL, "Error");
if(w.empty() || w==EPSILON) return p;
@@ -349,8 +352,8 @@ int PDTimp::Read(const std::string& fn)
sv.Read(ifsv);
tv.Read(iftv);
- TRACE_ERR("binary phrasefile loaded, default OFF_T: "<<PTF::getDefault()
- <<"\n");
+ VERBOSE(1,"binary phrasefile loaded, default OFF_T: "
+ <<PTF::getDefault() <<"\n");
return 1;
}
@@ -380,8 +383,8 @@ PhraseDictionaryTree::PhraseDictionaryTree()
: imp(new PDTimp)
{
if(sizeof(OFF_T)!=8) {
- UTIL_THROW2("ERROR: size of type 'OFF_T' has to be 64 bit!\n"
- "In gcc, use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n");
+ UTIL_THROW2("ERROR: size of type 'OFF_T' has to be 64 bit!\n"
+ "In gcc, use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n");
}
}
@@ -500,7 +503,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
// init numElement
numElement = tokens.size();
UTIL_THROW_IF2(numElement < (PrintWordAlignment()?4:3),
- "Format error");
+ "Format error");
}
if (tokens.size() != numElement) {
@@ -549,8 +552,8 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
PSA::Data& d=psa->insert(f);
if(d==InvalidOffT) d=fTell(ot);
else {
- UTIL_THROW2("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '"
- <<line);
+ UTIL_THROW2("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '"
+ <<line);
}
}
@@ -559,8 +562,8 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
if (!sparseFeatureString.empty()) {
std::vector<std::string> sparseTokens = Tokenize(sparseFeatureString);
if (sparseTokens.size() % 2 != 0) {
- UTIL_THROW2("ERROR: incorrectly formatted sparse feature string: " <<
- sparseFeatureString);
+ UTIL_THROW2("ERROR: incorrectly formatted sparse feature string: " <<
+ sparseFeatureString);
}
for (size_t i = 0; i < sparseTokens.size(); i+=2) {
fnames.push_back(imp->tv.add(sparseTokens[i]));
@@ -601,13 +604,13 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
PSA::Data& d=psa->insert(f);
if(d==InvalidOffT) d=fTell(ot);
else {
- UTIL_THROW2("ERROR: xsource phrase already inserted (B)!\nline(" << lnc << "): '"
- <<line);
+ UTIL_THROW2("ERROR: xsource phrase already inserted (B)!\nline(" << lnc << "): '"
+ <<line);
}
}
tgtCands.push_back(TgtCand(e,sc, alignmentString));
UTIL_THROW_IF2(currFirstWord == InvalidLabelId,
- "Uninitialize word");
+ "Uninitialize word");
tgtCands.back().SetFeatures(fnames, fvalues);
}
if (PrintWordAlignment())
@@ -660,7 +663,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
int PhraseDictionaryTree::Read(const std::string& fn)
{
- TRACE_ERR("size of OFF_T "<<sizeof(OFF_T)<<"\n");
+ VERBOSE(1,"size of OFF_T "<<sizeof(OFF_T)<<"\n");
return imp->Read(fn);
}
diff --git a/moses/TranslationModel/ProbingPT/Jamfile b/moses/TranslationModel/ProbingPT/Jamfile
new file mode 100644
index 000000000..d30ae3486
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/Jamfile
@@ -0,0 +1,13 @@
+local current = "" ;
+local includes = ;
+if [ option.get "with-probing-pt" : : "yes" ]
+{
+ fakelib ProbingPT : [ glob *.cpp ] ../..//headers : $(includes) <dependency>$(PT-LOG) : : $(includes) ;
+}
+else {
+ fakelib ProbingPT ;
+}
+
+path-constant PT-LOG : bin/pt.log ;
+update-if-changed $(PT-LOG) $(current) ;
+
diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp
new file mode 100644
index 000000000..b854c8c02
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp
@@ -0,0 +1,231 @@
+// vim:tabstop=2
+#include "ProbingPT.h"
+#include "moses/StaticData.h"
+#include "moses/FactorCollection.h"
+#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
+#include "quering.hh"
+
+using namespace std;
+
+namespace Moses
+{
+ProbingPT::ProbingPT(const std::string &line)
+: PhraseDictionary(line)
+,m_engine(NULL)
+{
+ ReadParameters();
+
+ assert(m_input.size() == 1);
+ assert(m_output.size() == 1);
+}
+
+ProbingPT::~ProbingPT()
+{
+ delete m_engine;
+}
+
+void ProbingPT::Load()
+{
+ SetFeaturesToApply();
+
+ m_engine = new QueryEngine(m_filePath.c_str());
+
+ m_unkId = 456456546456;
+
+ // source vocab
+ const std::map<uint64_t, std::string> &sourceVocab = m_engine->getSourceVocab();
+ std::map<uint64_t, std::string>::const_iterator iterSource;
+ for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) {
+ const string &wordStr = iterSource->second;
+ const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
+
+ uint64_t probingId = iterSource->first;
+
+ SourceVocabMap::value_type entry(factor, probingId);
+ m_sourceVocabMap.insert(entry);
+
+ }
+
+ // target vocab
+ const std::map<unsigned int, std::string> &probingVocab = m_engine->getVocab();
+ std::map<unsigned int, std::string>::const_iterator iter;
+ for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) {
+ const string &wordStr = iter->second;
+ const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
+
+ unsigned int probingId = iter->first;
+
+ TargetVocabMap::value_type entry(factor, probingId);
+ m_vocabMap.insert(entry);
+
+ }
+}
+
+void ProbingPT::InitializeForInput(InputType const& source)
+{
+ ReduceCache();
+}
+
+void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
+{
+ CacheColl &cache = GetCache();
+
+ InputPathList::const_iterator iter;
+ for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
+ InputPath &inputPath = **iter;
+ const Phrase &sourcePhrase = inputPath.GetPhrase();
+
+ if (sourcePhrase.GetSize() > StaticData::Instance().GetMaxPhraseLength()) {
+ continue;
+ }
+
+ TargetPhraseCollection *tpColl = CreateTargetPhrase(sourcePhrase);
+
+ // add target phrase to phrase-table cache
+ size_t hash = hash_value(sourcePhrase);
+ std::pair<const TargetPhraseCollection*, clock_t> value(tpColl, clock());
+ cache[hash] = value;
+
+ inputPath.SetTargetPhrases(*this, tpColl, NULL);
+ }
+}
+
+std::vector<uint64_t> ProbingPT::ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const
+{
+ size_t size = sourcePhrase.GetSize();
+ std::vector<uint64_t> ret(size);
+ for (size_t i = 0; i < size; ++i) {
+ const Factor *factor = sourcePhrase.GetFactor(i, m_input[0]);
+ uint64_t probingId = GetSourceProbingId(factor);
+ if (probingId == m_unkId) {
+ ok = false;
+ return ret;
+ }
+ else {
+ ret[i] = probingId;
+ }
+ }
+
+ ok = true;
+ return ret;
+}
+
+TargetPhraseCollection *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
+{
+ // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:'
+ assert(sourcePhrase.GetSize());
+
+ bool ok;
+ vector<uint64_t> probingSource = ConvertToProbingSourcePhrase(sourcePhrase, ok);
+ if (!ok) {
+ // source phrase contains a word unknown in the pt.
+ // We know immediately there's no translation for it
+ return NULL;
+ }
+
+ std::pair<bool, std::vector<target_text> > query_result;
+
+ TargetPhraseCollection *tpColl = NULL;
+
+ //Actual lookup
+ query_result = m_engine->query(probingSource);
+
+ if (query_result.first) {
+ //m_engine->printTargetInfo(query_result.second);
+ tpColl = new TargetPhraseCollection();
+
+ const std::vector<target_text> &probingTargetPhrases = query_result.second;
+ for (size_t i = 0; i < probingTargetPhrases.size(); ++i) {
+ const target_text &probingTargetPhrase = probingTargetPhrases[i];
+ TargetPhrase *tp = CreateTargetPhrase(sourcePhrase, probingTargetPhrase);
+
+ tpColl->Add(tp);
+ }
+
+ tpColl->Prune(true, m_tableLimit);
+ }
+
+ return tpColl;
+}
+
+TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const
+{
+ const std::vector<unsigned int> &probingPhrase = probingTargetPhrase.target_phrase;
+ size_t size = probingPhrase.size();
+
+ TargetPhrase *tp = new TargetPhrase(this);
+
+ // words
+ for (size_t i = 0; i < size; ++i) {
+ uint64_t probingId = probingPhrase[i];
+ const Factor *factor = GetTargetFactor(probingId);
+ assert(factor);
+
+ Word &word = tp->AddWord();
+ word.SetFactor(m_output[0], factor);
+ }
+
+ // score for this phrase table
+ vector<float> scores = probingTargetPhrase.prob;
+ std::transform(scores.begin(), scores.end(), scores.begin(),TransformScore);
+ tp->GetScoreBreakdown().PlusEquals(this, scores);
+
+ // alignment
+ /*
+ const std::vector<unsigned char> &alignments = probingTargetPhrase.word_all1;
+
+ AlignmentInfo &aligns = tp->GetAlignTerm();
+ for (size_t i = 0; i < alignS.size(); i += 2 ) {
+ aligns.Add((size_t) alignments[i], (size_t) alignments[i+1]);
+ }
+ */
+
+ // score of all other ff when this rule is being loaded
+ tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
+ return tp;
+}
+
+const Factor *ProbingPT::GetTargetFactor(uint64_t probingId) const
+{
+ TargetVocabMap::right_map::const_iterator iter;
+ iter = m_vocabMap.right.find(probingId);
+ if (iter != m_vocabMap.right.end()) {
+ return iter->second;
+ }
+ else {
+ // not in mapping. Must be UNK
+ return NULL;
+ }
+}
+
+uint64_t ProbingPT::GetSourceProbingId(const Factor *factor) const
+{
+ SourceVocabMap::left_map::const_iterator iter;
+ iter = m_sourceVocabMap.left.find(factor);
+ if (iter != m_sourceVocabMap.left.end()) {
+ return iter->second;
+ }
+ else {
+ // not in mapping. Must be UNK
+ return m_unkId;
+ }
+}
+
+ChartRuleLookupManager *ProbingPT::CreateRuleLookupManager(
+ const ChartParser &,
+ const ChartCellCollectionBase &,
+ std::size_t)
+{
+ abort();
+ return NULL;
+}
+
+TO_STRING_BODY(ProbingPT);
+
+// friend
+ostream& operator<<(ostream& out, const ProbingPT& phraseDict)
+{
+ return out;
+}
+
+}
diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h
new file mode 100644
index 000000000..b879760cb
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/ProbingPT.h
@@ -0,0 +1,59 @@
+
+#pragma once
+
+#include <boost/bimap.hpp>
+#include "../PhraseDictionary.h"
+
+class QueryEngine;
+class target_text;
+
+namespace Moses
+{
+class ChartParser;
+class ChartCellCollectionBase;
+class ChartRuleLookupManager;
+
+class ProbingPT : public PhraseDictionary
+{
+ friend std::ostream& operator<<(std::ostream&, const ProbingPT&);
+
+public:
+ ProbingPT(const std::string &line);
+ ~ProbingPT();
+
+ void Load();
+
+ void InitializeForInput(InputType const& source);
+
+ // for phrase-based model
+ void GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const;
+
+ // for syntax/hiero model (CKY+ decoding)
+ virtual ChartRuleLookupManager *CreateRuleLookupManager(
+ const ChartParser &,
+ const ChartCellCollectionBase &,
+ std::size_t);
+
+ TO_STRING();
+
+
+protected:
+ QueryEngine *m_engine;
+
+ typedef boost::bimap<const Factor *, uint64_t> SourceVocabMap;
+ mutable SourceVocabMap m_sourceVocabMap;
+
+ typedef boost::bimap<const Factor *, unsigned int> TargetVocabMap;
+ mutable TargetVocabMap m_vocabMap;
+
+ TargetPhraseCollection *CreateTargetPhrase(const Phrase &sourcePhrase) const;
+ TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const;
+ const Factor *GetTargetFactor(uint64_t probingId) const;
+ uint64_t GetSourceProbingId(const Factor *factor) const;
+
+ std::vector<uint64_t> ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const;
+
+ uint64_t m_unkId;
+};
+
+} // namespace Moses
diff --git a/moses/TranslationModel/ProbingPT/hash.cpp b/moses/TranslationModel/ProbingPT/hash.cpp
new file mode 100644
index 000000000..1049292b1
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/hash.cpp
@@ -0,0 +1,27 @@
+#include "hash.hh"
+
+uint64_t getHash(StringPiece text) {
+ std::size_t len = text.size();
+ uint64_t key = util::MurmurHashNative(text.data(), len);
+ return key;
+}
+
+std::vector<uint64_t> getVocabIDs(StringPiece textin){
+ //Tokenize
+ std::vector<uint64_t> output;
+
+ util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
+
+ while(it){
+ output.push_back(getHash(*it));
+ it++;
+ }
+
+ return output;
+}
+
+uint64_t getVocabID(std::string candidate) {
+ std::size_t len = candidate.length();
+ uint64_t key = util::MurmurHashNative(candidate.c_str(), len);
+ return key;
+} \ No newline at end of file
diff --git a/moses/TranslationModel/ProbingPT/hash.hh b/moses/TranslationModel/ProbingPT/hash.hh
new file mode 100644
index 000000000..a4fcd6330
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/hash.hh
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "util/string_piece.hh"
+#include "util/murmur_hash.hh"
+#include "util/string_piece.hh" //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+#include <vector>
+
+//Gets the MurmurmurHash for give string
+uint64_t getHash(StringPiece text);
+
+std::vector<uint64_t> getVocabIDs(StringPiece textin);
+
+uint64_t getVocabID(std::string candidate); \ No newline at end of file
diff --git a/moses/TranslationModel/ProbingPT/huffmanish.cpp b/moses/TranslationModel/ProbingPT/huffmanish.cpp
new file mode 100644
index 000000000..eea0a7c53
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/huffmanish.cpp
@@ -0,0 +1,414 @@
+#include "huffmanish.hh"
+
+Huffman::Huffman (const char * filepath) {
+ //Read the file
+ util::FilePiece filein(filepath);
+
+ //Init uniq_lines to zero;
+ uniq_lines = 0;
+
+ line_text prev_line; //Check for unique lines.
+ int num_lines = 0 ;
+
+ while (true){
+ line_text new_line;
+
+ num_lines++;
+
+ try {
+ //Process line read
+ new_line = splitLine(filein.ReadLine());
+ count_elements(new_line); //Counts the number of elements, adds new and increments counters.
+
+ } catch (util::EndOfFileException e){
+ std::cerr << "Unique entries counted: ";
+ break;
+ }
+
+ if (new_line.source_phrase == prev_line.source_phrase){
+ continue;
+ } else {
+ uniq_lines++;
+ prev_line = new_line;
+ }
+ }
+
+ std::cerr << uniq_lines << std::endl;
+}
+
+void Huffman::count_elements(line_text linein){
+ //For target phrase:
+ util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
+ while (it) {
+ //Check if we have that entry
+ std::map<std::string, unsigned int>::iterator mapiter;
+ mapiter = target_phrase_words.find(it->as_string());
+
+ if (mapiter != target_phrase_words.end()){
+ //If the element is found, increment the count.
+ mapiter->second++;
+ } else {
+ //Else create a new entry;
+ target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1));
+ }
+ it++;
+ }
+
+ //For word alignment 1
+ std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3;
+ std::vector<unsigned char> numbers = splitWordAll1(linein.word_all1);
+ mapiter3 = word_all1.find(numbers);
+
+ if (mapiter3 != word_all1.end()){
+ //If the element is found, increment the count.
+ mapiter3->second++;
+ } else {
+ //Else create a new entry;
+ word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1));
+ }
+
+}
+
+//Assigns huffman values for each unique element
+void Huffman::assign_values() {
+ //First create vectors for all maps so that we could sort them later.
+
+ //Create a vector for target phrases
+ for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) {
+ target_phrase_words_counts.push_back(*it);
+ }
+ //Sort it
+ std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());
+
+ //Create a vector for word allignments 1
+ for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) {
+ word_all1_counts.push_back(*it);
+ }
+ //Sort it
+ std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());
+
+
+ //Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter
+ unsigned int i = 1; //huffman code
+ for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin();
+ it != target_phrase_words_counts.end(); it++){
+ target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i));
+ i++; //Go to the next huffman code
+ }
+
+ i = 1; //Reset i for the next map
+ for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin();
+ it != word_all1_counts.end(); it++){
+ word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i));
+ i++; //Go to the next huffman code
+ }
+
+ //After lookups are produced, clear some memory usage of objects not needed anymore.
+ target_phrase_words.clear();
+ word_all1.clear();
+
+ target_phrase_words_counts.clear();
+ word_all1_counts.clear();
+
+ std::cerr << "Finished generating huffman codes." << std::endl;
+
+}
+
+void Huffman::serialize_maps(const char * dirname){
+ //Note that directory name should exist.
+ std::string basedir(dirname);
+ std::string target_phrase_path(basedir + "/target_phrases");
+ std::string probabilities_path(basedir + "/probs");
+ std::string word_all1_path(basedir + "/Wall1");
+
+ //Target phrase
+ std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
+ boost::archive::text_oarchive oarch(os);
+ oarch << lookup_target_phrase;
+ os.close();
+
+ //Word all1
+ std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
+ boost::archive::text_oarchive oarch2(os2);
+ oarch2 << lookup_word_all1;
+ os2.close();
+}
+
+std::vector<unsigned char> Huffman::full_encode_line(line_text line){
+ return vbyte_encode_line((encode_line(line)));
+}
+
+std::vector<unsigned int> Huffman::encode_line(line_text line){
+ std::vector<unsigned int> retvector;
+
+ //Get target_phrase first.
+ util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
+ while (it) {
+ retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
+ it++;
+ }
+ //Add a zero;
+ retvector.push_back(0);
+
+ //Get probabilities. Reinterpreting the float bytes as unsigned int.
+ util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
+ while (probit) {
+ //Sometimes we have too big floats to handle, so first convert to double
+ double tempnum = atof(probit->data());
+ float num = (float)tempnum;
+ retvector.push_back(reinterpret_float(&num));
+ probit++;
+ }
+ //Add a zero;
+ retvector.push_back(0);
+
+
+ //Get word alignments
+ retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_all1))->second);
+ retvector.push_back(0);
+
+ return retvector;
+}
+
+void Huffman::produce_lookups(){
+ //basically invert every map that we have
+ for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) {
+ lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first));
+ }
+
+ for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) {
+ lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first));
+ }
+
+}
+
+HuffmanDecoder::HuffmanDecoder (const char * dirname){
+ //Read the maps from disk
+
+ //Note that directory name should exist.
+ std::string basedir(dirname);
+ std::string target_phrase_path(basedir + "/target_phrases");
+ std::string word_all1_path(basedir + "/Wall1");
+
+ //Target phrases
+ std::ifstream is (target_phrase_path.c_str(), std::ios::binary);
+ boost::archive::text_iarchive iarch(is);
+ iarch >> lookup_target_phrase;
+ is.close();
+
+ //Word allignment 1
+ std::ifstream is2 (word_all1_path.c_str(), std::ios::binary);
+ boost::archive::text_iarchive iarch2(is2);
+ iarch2 >> lookup_word_all1;
+ is2.close();
+
+}
+
+HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target,
+ std::map<unsigned int, std::vector<unsigned char> > * lookup_word1) {
+ lookup_target_phrase = *lookup_target;
+ lookup_word_all1 = *lookup_word1;
+}
+
+std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines){
+ std::vector<target_text> retvector; //All target phrases
+ std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines); //All decoded lines
+ std::vector<unsigned int>::iterator it = decoded_lines.begin(); //Iterator for them
+ std::vector<unsigned int> current_target_phrase; //Current target phrase decoded
+
+ short zero_count = 0; //Count how many zeroes we have met so far. Every 3 zeroes mean a new target phrase.
+ while(it != decoded_lines.end()){
+ if (zero_count == 3) {
+ //We have finished with this entry, decode it, and add it to the retvector.
+ retvector.push_back(decode_line(current_target_phrase));
+ current_target_phrase.clear(); //Clear the current target phrase and the zero_count
+ zero_count = 0; //So that we can reuse them for the next target phrase
+ }
+ //Add to the next target_phrase, number by number.
+ current_target_phrase.push_back(*it);
+ if (*it == 0) {
+ zero_count++;
+ }
+ it++; //Go to the next word/symbol
+ }
+ //Don't forget the last remaining line!
+ if (zero_count == 3) {
+ //We have finished with this entry, decode it, and add it to the retvector.
+ retvector.push_back(decode_line(current_target_phrase));
+ current_target_phrase.clear(); //Clear the current target phrase and the zero_count
+ zero_count = 0; //So that we can reuse them for the next target phrase
+ }
+
+ return retvector;
+
+}
+
+target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input){
+ //demo decoder
+ target_text ret;
+ //Split everything
+ std::vector<unsigned int> target_phrase;
+ std::vector<unsigned int> probs;
+ unsigned int wAll;
+
+ //Split the line into the proper arrays
+ short num_zeroes = 0;
+ int counter = 0;
+ while (num_zeroes < 3){
+ unsigned int num = input[counter];
+ if (num == 0) {
+ num_zeroes++;
+ } else if (num_zeroes == 0){
+ target_phrase.push_back(num);
+ } else if (num_zeroes == 1){
+ probs.push_back(num);
+ } else if (num_zeroes == 2){
+ wAll = num;
+ }
+ counter++;
+ }
+
+ ret.target_phrase = target_phrase;
+ ret.word_all1 = lookup_word_all1.find(wAll)->second;
+
+ //Decode probabilities
+ for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); it++){
+ ret.prob.push_back(reinterpret_uint(&(*it)));
+ }
+
+ return ret;
+
+}
+
+inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id) {
+ return lookup_target_phrase.find(id)->second;
+}
+
+std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids){
+ std::string returnstring;
+ for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++){
+ returnstring.append(getTargetWordFromID(*it) + " ");
+ }
+
+ return returnstring;
+}
+
+inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase) {
+ return lookup_target_phrase->find(id)->second;
+}
+
+std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase) {
+ std::string returnstring;
+ for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++){
+ returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " ");
+ }
+
+ return returnstring;
+}
+
+/*Those functions are used to more easily store the floats in the binary phrase table
+ We convert the float to unsigned int so that it is the same as our other values and we can
+ apply variable byte encoding on top of it.*/
+
+inline unsigned int reinterpret_float(float * num){
+ unsigned int * converted_num;
+ converted_num = reinterpret_cast<unsigned int *>(num);
+ return *converted_num;
+}
+
+inline float reinterpret_uint(unsigned int * num){
+ float * converted_num;
+ converted_num = reinterpret_cast<float *>(num);
+ return *converted_num;
+}
+
+/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding
+and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding.
+This is highly optimized version with unrolled loop */
+inline std::vector<unsigned char> vbyte_encode(unsigned int num){
+ //Determine how many bytes we are going to take.
+ short size;
+ std::vector<unsigned char> byte_vector;
+
+ if (num < 0x00000080U) {
+ size = 1;
+ byte_vector.reserve(size);
+ goto b1;
+ }
+ if (num < 0x00004000U) {
+ size = 2;
+ byte_vector.reserve(size);
+ goto b2;
+ }
+ if (num < 0x00200000U) {
+ size = 3;
+ byte_vector.reserve(size);
+ goto b3;
+ }
+ if (num < 0x10000000U) {
+ size = 4;
+ byte_vector.reserve(size);
+ goto b4;
+ }
+ size = 5;
+ byte_vector.reserve(size);
+
+
+ //Now proceed with the encoding.
+ byte_vector.push_back((num & 0x7f) | 0x80);
+ num >>= 7;
+b4:
+ byte_vector.push_back((num & 0x7f) | 0x80);
+ num >>= 7;
+b3:
+ byte_vector.push_back((num & 0x7f) | 0x80);
+ num >>= 7;
+b2:
+ byte_vector.push_back((num & 0x7f) | 0x80);
+ num >>= 7;
+b1:
+ byte_vector.push_back(num);
+
+ return byte_vector;
+}
+
+std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line){
+ std::vector<unsigned int> huffman_line;
+ std::vector<unsigned char> current_num;
+
+ for (std::vector<unsigned char>::iterator it = line.begin(); it != line.end(); it++){
+ current_num.push_back(*it);
+ if ((*it >> 7) != 1) {
+ //We don't have continuation in the next bit
+ huffman_line.push_back(bytes_to_int(current_num));
+ current_num.clear();
+ }
+ }
+ return huffman_line;
+}
+
+inline unsigned int bytes_to_int(std::vector<unsigned char> number){
+ unsigned int retvalue = 0;
+ std::vector<unsigned char>::iterator it = number.begin();
+ unsigned char shift = 0; //By how many bits to shift
+
+ while (it != number.end()) {
+ retvalue |= (*it & 0x7f) << shift;
+ shift += 7;
+ it++;
+ }
+
+ return retvalue;
+}
+
+std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line) {
+ std::vector<unsigned char> retvec;
+
+ //For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars.
+ for (std::vector<unsigned int>::iterator it = line.begin(); it != line.end(); it++){
+ std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
+ retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
+ }
+
+ return retvec;
+}
diff --git a/moses/TranslationModel/ProbingPT/huffmanish.hh b/moses/TranslationModel/ProbingPT/huffmanish.hh
new file mode 100644
index 000000000..3116484e9
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/huffmanish.hh
@@ -0,0 +1,110 @@
+#pragma once
+
+//Huffman encodes a line and also produces the vocabulary ids
+#include "hash.hh"
+#include "line_splitter.hh"
+#include <stdio.h>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <boost/serialization/serialization.hpp>
+#include <boost/serialization/vector.hpp>
+#include <boost/serialization/map.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <boost/archive/text_oarchive.hpp>
+
+//Sorting for the second
+struct sort_pair {
+ bool operator()(const std::pair<std::string, unsigned int> &left, const std::pair<std::string, unsigned int> &right) {
+ return left.second > right.second; //This puts biggest numbers first.
+ }
+};
+
+struct sort_pair_vec {
+ bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &left, const std::pair<std::vector<unsigned char>, unsigned int> &right) {
+ return left.second > right.second; //This puts biggest numbers first.
+ }
+};
+
+class Huffman {
+ unsigned long uniq_lines; //Unique lines in the file.
+
+ //Containers used when counting the occurrence of a given phrase
+ std::map<std::string, unsigned int> target_phrase_words;
+ std::map<std::vector<unsigned char>, unsigned int> word_all1;
+
+ //Same containers as vectors, for sorting
+ std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts;
+ std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts;
+
+ //Huffman maps
+ std::map<std::string, unsigned int> target_phrase_huffman;
+ std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman;
+
+ //inverted maps
+ std::map<unsigned int, std::string> lookup_target_phrase;
+ std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
+
+ public:
+ Huffman (const char *);
+ void count_elements (line_text line);
+ void assign_values();
+ void serialize_maps(const char * dirname);
+ void produce_lookups();
+
+ std::vector<unsigned int> encode_line(line_text line);
+
+ //encode line + variable byte ontop
+ std::vector<unsigned char> full_encode_line(line_text line);
+
+ //Getters
+ const std::map<unsigned int, std::string> get_target_lookup_map() const{
+ return lookup_target_phrase;
+ }
+ const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const{
+ return lookup_word_all1;
+ }
+
+ unsigned long getUniqLines() {
+ return uniq_lines;
+ }
+};
+
+class HuffmanDecoder {
+ std::map<unsigned int, std::string> lookup_target_phrase;
+ std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
+
+public:
+ HuffmanDecoder (const char *);
+ HuffmanDecoder (std::map<unsigned int, std::string> *, std::map<unsigned int, std::vector<unsigned char> > *);
+
+ //Getters
+ const std::map<unsigned int, std::string> get_target_lookup_map() const{
+ return lookup_target_phrase;
+ }
+ const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const{
+ return lookup_word_all1;
+ }
+
+ inline std::string getTargetWordFromID(unsigned int id);
+
+ std::string getTargetWordsFromIDs(std::vector<unsigned int> ids);
+
+ target_text decode_line (std::vector<unsigned int> input);
+
+ //Variable byte decodes all target phrases contained here and then passes them to decode_line
+ std::vector<target_text> full_decode_line (std::vector<unsigned char> lines);
+};
+
+std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase);
+
+inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase);
+
+inline unsigned int reinterpret_float(float * num);
+
+inline float reinterpret_uint(unsigned int * num);
+
+std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line);
+inline std::vector<unsigned char> vbyte_encode(unsigned int num);
+std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line);
+inline unsigned int bytes_to_int(std::vector<unsigned char> number);
diff --git a/moses/TranslationModel/ProbingPT/line_splitter.cpp b/moses/TranslationModel/ProbingPT/line_splitter.cpp
new file mode 100644
index 000000000..f50090e4c
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/line_splitter.cpp
@@ -0,0 +1,52 @@
+#include "line_splitter.hh"
+
+line_text splitLine(StringPiece textin) {
+ const char delim[] = " ||| ";
+ line_text output;
+
+ //Tokenize
+ util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
+ //Get source phrase
+ output.source_phrase = *it;
+ it++;
+ //Get target_phrase
+ output.target_phrase = *it;
+ it++;
+ //Get probabilities
+ output.prob = *it;
+ it++;
+ //Get WordAllignment 1
+ output.word_all1 = *it;
+ it++;
+ //Get WordAllignment 2
+ output.word_all2 = *it;
+
+ return output;
+}
+
+std::vector<unsigned char> splitWordAll1(StringPiece textin){
+ const char delim[] = " ";
+ const char delim2[] = "-";
+ std::vector<unsigned char> output;
+
+ //Split on space
+ util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
+
+ //For each int
+ while (it) {
+ //Split on dash (-)
+ util::TokenIter<util::MultiCharacter> itInner(*it, util::MultiCharacter(delim2));
+
+ //Insert the two entries in the vector. User will read entry 0 and 1 to get the first,
+ //2 and 3 for second etc. Use unsigned char instead of int to save space, as
+ //word alignments are all very small numbers that fit in a single byte
+ output.push_back((unsigned char)(atoi(itInner->data())));
+ itInner++;
+ output.push_back((unsigned char)(atoi(itInner->data())));
+ it++;
+ }
+
+ return output;
+
+}
+
diff --git a/moses/TranslationModel/ProbingPT/line_splitter.hh b/moses/TranslationModel/ProbingPT/line_splitter.hh
new file mode 100644
index 000000000..c699a28c0
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/line_splitter.hh
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/file_piece.hh"
+#include <vector>
+#include <stdlib.h> //atof
+#include "util/string_piece.hh" //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+#include <vector>
+
+//Struct for holding processed line
+struct line_text {
+ StringPiece source_phrase;
+ StringPiece target_phrase;
+ StringPiece prob;
+ StringPiece word_all1;
+ StringPiece word_all2;
+};
+
+//Struct for holding processed line
+struct target_text {
+ std::vector<unsigned int> target_phrase;
+ std::vector<float> prob;
+ std::vector<unsigned char> word_all1;
+};
+
+//Ask if it's better to have it receive a pointer to a line_text struct
+line_text splitLine(StringPiece textin);
+
+std::vector<unsigned char> splitWordAll1(StringPiece textin);
diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp
new file mode 100644
index 000000000..35cb9e538
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp
@@ -0,0 +1,32 @@
+#include "probing_hash_utils.hh"
+
+//Read table from disk, return memory map location
+char * readTable(const char * filename, size_t size) {
+ //Initial position of the file is the end of the file, thus we know the size
+ int fd;
+ char * map;
+
+ fd = open(filename, O_RDONLY);
+ if (fd == -1) {
+ perror("Error opening file for reading");
+ exit(EXIT_FAILURE);
+ }
+
+ map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
+
+ if (map == MAP_FAILED) {
+ close(fd);
+ perror("Error mmapping the file");
+ exit(EXIT_FAILURE);
+ }
+
+ return map;
+}
+
+
+void serialize_table(char *mem, size_t size, const char * filename){
+ std::ofstream os (filename, std::ios::binary);
+ os.write((const char*)&mem[0], size);
+ os.close();
+
+} \ No newline at end of file
diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh
new file mode 100644
index 000000000..964097829
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "util/probing_hash_table.hh"
+
+#include <sys/mman.h>
+#include <boost/functional/hash.hpp>
+#include <fcntl.h>
+#include <fstream>
+
+
+//Hash table entry
+struct Entry {
+ uint64_t key;
+ typedef uint64_t Key;
+ unsigned int bytes_toread;
+
+ uint64_t GetKey() const {
+ return key;
+ }
+
+ void SetKey(uint64_t to) {
+ key = to;
+ }
+
+ uint64_t GetValue() const {
+ return value;
+ }
+
+ uint64_t value;
+};
+
+//Define table
+typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
+
+void serialize_table(char *mem, size_t size, const char * filename);
+
+char * readTable(const char * filename, size_t size);
diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp
new file mode 100644
index 000000000..18efed917
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/quering.cpp
@@ -0,0 +1,174 @@
+#include "quering.hh"
+
+unsigned char * read_binary_file(const char * filename, size_t filesize){
+ //Get filesize
+ int fd;
+ unsigned char * map;
+
+ fd = open(filename, O_RDONLY);
+
+ if (fd == -1) {
+ perror("Error opening file for reading");
+ exit(EXIT_FAILURE);
+ }
+
+ map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0);
+ if (map == MAP_FAILED) {
+ close(fd);
+ perror("Error mmapping the file");
+ exit(EXIT_FAILURE);
+ }
+
+ return map;
+}
+
+QueryEngine::QueryEngine(const char * filepath) : decoder(filepath){
+
+ //Create filepaths
+ std::string basepath(filepath);
+ std::string path_to_hashtable = basepath + "/probing_hash.dat";
+ std::string path_to_data_bin = basepath + "/binfile.dat";
+ std::string path_to_source_vocabid = basepath + "/source_vocabids";
+
+ ///Source phrase vocabids
+ read_map(&source_vocabids, path_to_source_vocabid.c_str());
+
+ //Target phrase vocabIDs
+ vocabids = decoder.get_target_lookup_map();
+
+ //Read config file
+ std::string line;
+ std::ifstream config ((basepath + "/config").c_str());
+ getline(config, line);
+ int tablesize = atoi(line.c_str()); //Get tablesize.
+ config.close();
+
+ //Mmap binary table
+ struct stat filestatus;
+ stat(path_to_data_bin.c_str(), &filestatus);
+ binary_filesize = filestatus.st_size;
+ binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize);
+
+ //Read hashtable
+ size_t table_filesize = Table::Size(tablesize, 1.2);
+ mem = readTable(path_to_hashtable.c_str(), table_filesize);
+ Table table_init(mem, table_filesize);
+ table = table_init;
+
+ std::cerr << "Initialized successfully! " << std::endl;
+}
+
+QueryEngine::~QueryEngine(){
+ //Clear mmap content from memory.
+ munmap(binary_mmaped, binary_filesize);
+ munmap(mem, table_filesize);
+
+}
+
+std::pair<bool, std::vector<target_text> > QueryEngine::query(std::vector<uint64_t> source_phrase){
+ bool found;
+ std::vector<target_text> translation_entries;
+ const Entry * entry;
+ //TOO SLOW
+ //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
+ uint64_t key = 0;
+ for (int i = 0; i < source_phrase.size(); i++){
+ key += source_phrase[i];
+ }
+
+
+ found = table.Find(key, entry);
+
+ if (found){
+ //The phrase that was searched for was found! We need to get the translation entries.
+ //We will read the largest entry in bytes and then filter the unnecessary with functions
+ //from line_splitter
+ uint64_t initial_index = entry -> GetValue();
+ unsigned int bytes_toread = entry -> bytes_toread;
+
+ //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
+ std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
+ encoded_text.reserve(bytes_toread);
+ for (int i = 0; i < bytes_toread; i++){
+ encoded_text.push_back(binary_mmaped[i+initial_index]);
+ }
+
+ //Get only the translation entries necessary
+ translation_entries = decoder.full_decode_line(encoded_text);
+
+ }
+
+ std::pair<bool, std::vector<target_text> > output (found, translation_entries);
+
+ return output;
+
+}
+
+std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase){
+ bool found;
+ std::vector<target_text> translation_entries;
+ const Entry * entry;
+ //Convert source phrase to VID
+ std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);
+ //TOO SLOW
+ //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
+ uint64_t key = 0;
+ for (int i = 0; i < source_phrase_vid.size(); i++){
+ key += source_phrase_vid[i];
+ }
+
+ found = table.Find(key, entry);
+
+
+ if (found){
+ //The phrase that was searched for was found! We need to get the translation entries.
+ //We will read the largest entry in bytes and then filter the unnecessary with functions
+ //from line_splitter
+ uint64_t initial_index = entry -> GetValue();
+ unsigned int bytes_toread = entry -> bytes_toread;
+ //At the end of the file we can't read + largest_entry because we get a segfault.
+ std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl;
+
+ //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
+ std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
+ encoded_text.reserve(bytes_toread);
+ for (int i = 0; i < bytes_toread; i++){
+ encoded_text.push_back(binary_mmaped[i+initial_index]);
+ }
+
+ //Get only the translation entries necessary
+ translation_entries = decoder.full_decode_line(encoded_text);
+
+ }
+
+ std::pair<bool, std::vector<target_text> > output (found, translation_entries);
+
+ return output;
+
+}
+
+void QueryEngine::printTargetInfo(std::vector<target_text> target_phrases){
+ int entries = target_phrases.size();
+
+ for (int i = 0; i<entries; i++){
+ std::cout << "Entry " << i+1 << " of " << entries << ":" << std::endl;
+ //Print text
+ std::cout << getTargetWordsFromIDs(target_phrases[i].target_phrase, &vocabids) << "\t";
+
+ //Print probabilities:
+ for (int j = 0; j<target_phrases[i].prob.size(); j++){
+ std::cout << target_phrases[i].prob[j] << " ";
+ }
+ std::cout << "\t";
+
+ //Print word_all1
+ for (int j = 0; j<target_phrases[i].word_all1.size(); j++){
+ if (j%2 == 0){
+ std::cout << (short)target_phrases[i].word_all1[j] << "-";
+ }else{
+ std::cout << (short)target_phrases[i].word_all1[j] << " ";
+ }
+ }
+ std::cout << std::endl;
+ }
+}
diff --git a/moses/TranslationModel/ProbingPT/quering.hh b/moses/TranslationModel/ProbingPT/quering.hh
new file mode 100644
index 000000000..133f484ce
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/quering.hh
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "probing_hash_utils.hh"
+#include "huffmanish.hh"
+#include "hash.hh" //Includes line splitter
+#include <sys/stat.h> //For finding size of file
+#include "vocabid.hh"
+
+
+char * read_binary_file(char * filename);
+
+class QueryEngine {
+ unsigned char * binary_mmaped; //The binary phrase table file
+ std::map<unsigned int, std::string> vocabids;
+ std::map<uint64_t, std::string> source_vocabids;
+
+ Table table;
+ char *mem; //Memory for the table, necessary so that we can correctly destroy the object
+
+ HuffmanDecoder decoder;
+
+ size_t binary_filesize;
+ size_t table_filesize;
+ public:
+ QueryEngine (const char *);
+ ~QueryEngine();
+ std::pair<bool, std::vector<target_text> > query(StringPiece source_phrase);
+ std::pair<bool, std::vector<target_text> > query(std::vector<uint64_t> source_phrase);
+ void printTargetInfo(std::vector<target_text> target_phrases);
+ const std::map<unsigned int, std::string> getVocab() const
+ { return decoder.get_target_lookup_map(); }
+
+ const std::map<uint64_t, std::string> getSourceVocab() const {
+ return source_vocabids;
+ }
+
+};
+
+
diff --git a/moses/TranslationModel/ProbingPT/storing.cpp b/moses/TranslationModel/ProbingPT/storing.cpp
new file mode 100644
index 000000000..5ea0df39c
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/storing.cpp
@@ -0,0 +1,151 @@
+#include "storing.hh"
+
+BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary) {
+ binfile.reserve(10000); //Reserve part of the vector to avoid reallocation
+ it = binfile.begin();
+ dist_from_start = 0; //Initialize variables
+ extra_counter = 0;
+}
+
+void BinaryFileWriter::write (std::vector<unsigned char> * bytes) {
+ binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes
+ //Keep track of the offsets
+ it += bytes->size();
+ dist_from_start = distance(binfile.begin(),it);
+ //Flush the vector to disk every once in a while so that we don't consume too much ram
+ if (dist_from_start > 9000) {
+ flush();
+ }
+}
+
+void BinaryFileWriter::flush () {
+ //Cast unsigned char to char before writing...
+ os.write((char *)&binfile[0], dist_from_start);
+ //Clear the vector:
+ binfile.clear();
+ binfile.reserve(10000);
+ extra_counter += dist_from_start; //Keep track of the total number of bytes.
+ it = binfile.begin(); //Reset iterator
+ dist_from_start = distance(binfile.begin(),it); //Reset dist from start
+}
+
+BinaryFileWriter::~BinaryFileWriter (){
+ os.close();
+ binfile.clear();
+}
+
+void createProbingPT(const char * phrasetable_path, const char * target_path){
+ //Get basepath and create directory if missing
+ std::string basepath(target_path);
+ mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+
+ //Set up huffman and serialize decoder maps.
+ Huffman huffmanEncoder(phrasetable_path); //initialize
+ huffmanEncoder.assign_values();
+ huffmanEncoder.produce_lookups();
+ huffmanEncoder.serialize_maps(target_path);
+
+ //Get uniq lines:
+ unsigned long uniq_entries = huffmanEncoder.getUniqLines();
+
+ //Source phrase vocabids
+ std::map<uint64_t, std::string> source_vocabids;
+
+ //Read the file
+ util::FilePiece filein(phrasetable_path);
+
+ //Init the probing hash table
+ size_t size = Table::Size(uniq_entries, 1.2);
+ char * mem = new char[size];
+ memset(mem, 0, size);
+ Table table(mem, size);
+
+ BinaryFileWriter binfile(basepath); //Init the binary file writer.
+
+ line_text prev_line; //Check if the source phrase of the previous line is the same
+
+ //Keep track of the size of each group of target phrases
+ uint64_t entrystartidx = 0;
+ //uint64_t line_num = 0;
+
+
+ //Read everything and process
+ while(true){
+ try {
+ //Process line read
+ line_text line;
+ line = splitLine(filein.ReadLine());
+ //Add source phrases to vocabularyIDs
+ add_to_map(&source_vocabids, line.source_phrase);
+
+ if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
+ prev_line = line; //For the first iteration assume the previous line is
+ } //The same as this one.
+
+ if (line.source_phrase != prev_line.source_phrase){
+
+ //Create a new entry even
+
+ //Create an entry for the previous source phrase:
+ Entry pesho;
+ pesho.value = entrystartidx;
+ //The key is the sum of hashes of individual words. Probably not entirely correct, but fast
+ pesho.key = 0;
+ std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
+ for (int i = 0; i < vocabid_source.size(); i++){
+ pesho.key += vocabid_source[i];
+ }
+ pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
+
+ //Put into table
+ table.Insert(pesho);
+
+ entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
+
+ //Encode a line and write it to disk.
+ std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
+ binfile.write(&encoded_line);
+
+ //Set prevLine
+ prev_line = line;
+
+ } else{
+ //If we still have the same line, just append to it:
+ std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
+ binfile.write(&encoded_line);
+ }
+
+ } catch (util::EndOfFileException e){
+ std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
+ binfile.flush();
+
+ //After the final entry is constructed we need to add it to the phrase_table
+ //Create an entry for the previous source phrase:
+ Entry pesho;
+ pesho.value = entrystartidx;
+ //The key is the sum of hashes of individual words. Probably not entirely correct, but fast
+ pesho.key = 0;
+ std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
+ for (int i = 0; i < vocabid_source.size(); i++){
+ pesho.key += vocabid_source[i];
+ }
+ pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
+ //Put into table
+ table.Insert(pesho);
+
+ break;
+ }
+ }
+
+ serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());
+
+ serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());
+
+ delete[] mem;
+
+ //Write configfile
+ std::ofstream configfile;
+ configfile.open((basepath + "/config").c_str());
+ configfile << uniq_entries << '\n';
+ configfile.close();
+}
diff --git a/moses/TranslationModel/ProbingPT/storing.hh b/moses/TranslationModel/ProbingPT/storing.hh
new file mode 100644
index 000000000..dfcdbcc41
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/storing.hh
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <stdio.h>
+#include <fstream>
+#include <iostream>
+
+#include "hash.hh" //Includes line_splitter
+#include "probing_hash_utils.hh"
+#include "huffmanish.hh"
+#include <sys/stat.h> //mkdir
+
+#include "util/file_piece.hh"
+#include "util/file.hh"
+#include "vocabid.hh"
+
+void createProbingPT(const char * phrasetable_path, const char * target_path);
+
+class BinaryFileWriter {
+ std::vector<unsigned char> binfile;
+ std::vector<unsigned char>::iterator it;
+ //Output binary
+ std::ofstream os;
+
+public:
+ unsigned int dist_from_start; //Distance from the start of the vector.
+ uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so we accumulate the flushed bytes here.
+
+ BinaryFileWriter (std::string);
+ ~BinaryFileWriter ();
+ void write (std::vector<unsigned char> * bytes);
+ void flush (); //Flush to disk
+
+};
diff --git a/moses/TranslationModel/ProbingPT/tests/tokenization_tests.cpp b/moses/TranslationModel/ProbingPT/tests/tokenization_tests.cpp
new file mode 100644
index 000000000..2a63242de
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/tests/tokenization_tests.cpp
@@ -0,0 +1,198 @@
+#include "line_splitter.hh"
+
+bool test_vectorinsert() {
+ StringPiece line1 = StringPiece("! ! ! ! ||| ! ! ! ! ||| 0.0804289 0.141656 0.0804289 0.443409 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 1 1");
+ StringPiece line2 = StringPiece("! ! ! ) , has ||| ! ! ! ) - , a ||| 0.0804289 0.0257627 0.0804289 0.00146736 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 5-6 ||| 1 1 1");
+ line_text output = splitLine(line1);
+ line_text output2 = splitLine(line2);
+
+ //Init container vector and iterator.
+ std::vector<char> container;
+ container.reserve(10000); //Reserve vector
+ std::vector<char>::iterator it = container.begin();
+ std::pair<std::vector<char>::iterator, int> binary_append_ret; //Return values from vector_append
+
+ //Put a value into the vector
+ binary_append_ret = vector_append(&output, &container, it, false);
+ it = binary_append_ret.first;
+ binary_append_ret = vector_append(&output2, &container, it, false);
+ it = binary_append_ret.first;
+
+ std::string test(container.begin(), container.end());
+ std::string should_be = "! ! ! ! 0.0804289 0.141656 0.0804289 0.443409 2.718 0-0 1-1 2-2 3-3 1 1 1! ! ! ) - , a 0.0804289 0.0257627 0.0804289 0.00146736 2.718 0-0 1-1 2-2 3-3 4-4 4-5 5-6 1 1 1";
+ if (test == should_be) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool probabilitiesTest(){
+ StringPiece line1 = StringPiece("0.536553 0.75961 0.634108 0.532927 2.718");
+ StringPiece line2 = StringPiece("1.42081e-05 3.91895e-09 0.0738539 0.749514 2.718");
+
+ std::vector<double> pesho;
+ bool peshobool = false;
+ bool kirobool = false;
+ std::vector<double> kiro;
+
+ pesho = splitProbabilities(line1);
+ kiro = splitProbabilities(line2);
+
+ if (pesho[0] == 0.536553 && pesho[1] == 0.75961 && pesho[2] == 0.634108 && pesho[3] == 0.532927 && pesho[4] == 2.718 && pesho.size() == 5) {
+ peshobool = true;
+ } else {
+ std::cout << "Processed: " << pesho[0] << " " << pesho[1] << " " << pesho[2] << " " << pesho[3] << " " << pesho[4] << std::endl;
+ std::cout << "Size is: " << pesho.size() << " Expected 5." << std::endl;
+ std::cout << "Expected: " << "0.536553 0.75961 0.634108 0.532927 2.718" << std::endl;
+ }
+
+ if (kiro[0] == 1.42081e-05 && kiro[1] == 3.91895e-09 && kiro[2] == 0.0738539 && kiro[3] == 0.749514 && kiro[4] == 2.718 && kiro.size() == 5) {
+ kirobool = true;
+ } else {
+ std::cout << "Processed: " << kiro[0] << " " << kiro[1] << " " << kiro[2] << " " << kiro[3] << " " << kiro[4] << std::endl;
+ std::cout << "Size is: " << kiro.size() << " Expected 5." << std::endl;
+ std::cout << "Expected: " << "1.42081e-05 3.91895e-09 0.0738539 0.749514 2.718" << std::endl;
+ }
+
+ return (peshobool && kirobool);
+}
+
+bool wordAll1test(){
+ StringPiece line1 = StringPiece("2-0 3-1 4-2 5-2");
+ StringPiece line2 = StringPiece("0-0 1-1 2-2 3-3 4-3 6-4 5-5");
+
+ std::vector<int> pesho;
+ bool peshobool = false;
+ bool kirobool = false;
+ std::vector<int> kiro;
+
+ pesho = splitWordAll1(line1);
+ kiro = splitWordAll1(line2);
+
+ if (pesho[0] == 2 && pesho[1] == 0 && pesho[2] == 3 && pesho[3] == 1 && pesho[4] == 4
+ && pesho[5] == 2 && pesho[6] == 5 && pesho[7] == 2 && pesho.size() == 8) {
+ peshobool = true;
+ } else {
+ std::cout << "Processed: " << pesho[0] << "-" << pesho[1] << " " << pesho[2] << "-" << pesho[3] << " "
+ << pesho[4] << "-" << pesho[5] << " " << pesho[6] << "-" << pesho[7] << std::endl;
+ std::cout << "Size is: " << pesho.size() << " Expected: 8." << std::endl;
+ std::cout << "Expected: " << "2-0 3-1 4-2 5-2" << std::endl;
+ }
+
+ if (kiro[0] == 0 && kiro[1] == 0 && kiro[2] == 1 && kiro[3] == 1 && kiro[4] == 2 && kiro[5] == 2
+ && kiro[6] == 3 && kiro[7] == 3 && kiro[8] == 4 && kiro[9] == 3 && kiro[10] == 6 && kiro[11] == 4
+ && kiro[12] == 5 && kiro[13] == 5 && kiro.size() == 14){
+ kirobool = true;
+ } else {
+ std::cout << "Processed: " << kiro[0] << "-" << kiro[1] << " " << kiro[2] << "-" << kiro[3] << " "
+ << kiro[4] << "-" << kiro[5] << " " << kiro[6] << "-" << kiro[7] << " " << kiro[8] << "-" << kiro[9]
+ << " " << kiro[10] << "-" << kiro[11] << " " << kiro[12] << "-" << kiro[13] << std::endl;
+ std::cout << "Size is: " << kiro.size() << " Expected: 14" << std::endl;
+ std::cout << "Expected: " << "0-0 1-1 2-2 3-3 4-3 6-4 5-5" << std::endl;
+ }
+
+ return (peshobool && kirobool);
+}
+
+bool wordAll2test(){
+ StringPiece line1 = StringPiece("4 9 1");
+ StringPiece line2 = StringPiece("3255 9 1");
+
+ std::vector<int> pesho;
+ bool peshobool = false;
+ bool kirobool = false;
+ std::vector<int> kiro;
+
+ pesho = splitWordAll2(line1);
+ kiro = splitWordAll2(line2);
+
+ if (pesho[0] == 4 && pesho[1] == 9 && pesho[2] == 1 && pesho.size() == 3){
+ peshobool = true;
+ } else {
+ std::cout << "Processed: " << pesho[0] << " " << pesho[1] << " " << pesho[2] << std::endl;
+ std::cout << "Size: " << pesho.size() << " Expected: 3" << std::endl;
+ std::cout << "Expected: " << "4 9 1" << std::endl;
+ }
+
+ if (kiro[0] == 3255 && kiro[1] == 9 && kiro[2] == 1 && kiro.size() == 3){
+ kirobool = true;
+ } else {
+ std::cout << "Processed: " << kiro[0] << " " << kiro[1] << " " << kiro[2] << std::endl;
+ std::cout << "Size: " << kiro.size() << " Expected: 3" << std::endl;
+ std::cout << "Expected: " << "3255 9 1" << std::endl;
+ }
+
+ return (peshobool && kirobool);
+
+}
+
+bool test_tokenization(){
+ StringPiece line1 = StringPiece("! ! ! ! ||| ! ! ! ! ||| 0.0804289 0.141656 0.0804289 0.443409 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 1 1");
+ StringPiece line2 = StringPiece("! ! ! ) , has ||| ! ! ! ) - , a ||| 0.0804289 0.0257627 0.0804289 0.00146736 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 5-6 ||| 1 1 1");
+ StringPiece line3 = StringPiece("! ! ! ) , ||| ! ! ! ) - , ||| 0.0804289 0.075225 0.0804289 0.00310345 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 ||| 1 1 1");
+ StringPiece line4 = StringPiece("! ! ! ) ||| ! ! ! ) . ||| 0.0804289 0.177547 0.0268096 0.000872597 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 3 1");
+
+ line_text output1 = splitLine(line1);
+ line_text output2 = splitLine(line2);
+ line_text output3 = splitLine(line3);
+ line_text output4 = splitLine(line4);
+
+ bool test1 = output1.prob == StringPiece("0.0804289 0.141656 0.0804289 0.443409 2.718");
+ bool test2 = output2.word_all1 == StringPiece("0-0 1-1 2-2 3-3 4-4 4-5 5-6");
+ bool test3 = output2.target_phrase == StringPiece("! ! ! ) - , a");
+ bool test4 = output3.source_phrase == StringPiece("! ! ! ) ,");
+ bool test5 = output4.word_all2 == StringPiece("1 3 1");
+
+ //std::cout << test1 << " " << test2 << " " << test3 << " " << test4 << std::endl;
+
+ return (test1 && test2 && test3 && test4 && test5);
+
+}
+
+bool test_linesplitter(){
+ StringPiece line1 = StringPiece("! &#93; 0.0738539 0.901133 0.0738539 0.65207 2.718 0-0 1-1 1 1 1");
+ target_text ans1;
+ ans1 = splitSingleTargetLine(line1);
+
+ /* For testing purposes
+ std::cout << ans1.target_phrase[0] << " " <<ans1.target_phrase[1] << " Size: " << ans1.target_phrase.size() << std::endl;
+ std::cout << ans1.word_all1[3] << " " << ans1.word_all2[2] << " " << ans1.prob[3] << std::endl; */
+
+ return (ans1.target_phrase.size() == 2 && ans1.prob.size() == 5 && ans1.word_all1.size() == 4 && ans1.word_all2.size() == 3);
+}
+
+bool test_linessplitter(){
+ StringPiece line1 = StringPiece("! &#93; 0.0738539 0.901133 0.0738539 0.65207 2.718 0-0 1-1 1 1 1\n\n! ) . proto došlo 0.0738539 7.14446e-06");
+ StringPiece line2 = StringPiece("! &quot; ) 0.536553 0.75961 0.634108 0.532927 2.718 0-0 1-1 2-2 13 11 8\n! ) . 0.0369269 0.00049839 0.00671399 0.00372884 2.718 0-0 1-1 2-1 2-2 2 11 1\n&quot; ! ) 0.0738539 0.75961 0.00671399 0.532927 2.718 1-0 0-1 2-2 1 11 1\nse ! &quot; ) 0.0738539 0.75961 0.00671399 0.0225211 2.718 0-1 1-2 2-3 1 11 1\n\n! &quot; , a to 0.0738539 0.0894238 0.0738539 0.048");
+
+ std::vector<target_text> ans1;
+ std::vector<target_text> ans2;
+
+ ans1 = splitTargetLine(line1);
+ ans2 = splitTargetLine(line2);
+
+ bool sizes = ans1.size() == 1 && ans2.size() == 4;
+ bool prob = ans1[0].prob[3] == 0.65207 && ans2[1].prob[1] == 0.00049839;
+ bool word_alls = ans2[0].word_all2[1] == 11 && ans2[3].word_all1[5] == 3;
+
+ /* For testing
+ std::cout << ans1.size() << std::endl;
+ std::cout << ans2.size() << std::endl;
+ std::cout << ans1[0].prob[3] << std::endl;
+ std::cout << ans2[1].prob[1] << std::endl;
+ std::cout << ans2[0].word_all2[1] << std::endl;
+ std::cout << ans2[3].word_all1[5] << std::endl; */
+
+ return sizes && prob && word_alls;
+}
+
+int main(){
+ if (probabilitiesTest() && wordAll1test() && wordAll2test() && test_tokenization() && test_linesplitter() && test_linessplitter() && test_vectorinsert()){
+ std::cout << "All tests pass!" << std::endl;
+ } else {
+ std::cout << "Failiure in some tests!" << std::endl;
+ }
+
+ return 1;
+} \ No newline at end of file
diff --git a/moses/TranslationModel/ProbingPT/tests/vocabid_test.cpp b/moses/TranslationModel/ProbingPT/tests/vocabid_test.cpp
new file mode 100644
index 000000000..bc82db74e
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/tests/vocabid_test.cpp
@@ -0,0 +1,45 @@
+#include <map> //Map for vocab ids
+
+#include "hash.hh"
+#include "vocabid.hh"
+
+int main(int argc, char* argv[]){
+
+ //Create a map and serialize it
+ std::map<uint64_t, std::string> vocabids;
+ StringPiece demotext = StringPiece("Demo text with 3 elements");
+ add_to_map(&vocabids, demotext);
+ //Serialize map
+ serialize_map(&vocabids, "/tmp/testmap.bin");
+
+ //Read the map and test if the values are the same
+ std::map<uint64_t, std::string> newmap;
+ read_map(&newmap, "/tmp/testmap.bin");
+
+ //Used hashes
+ uint64_t num1 = getHash(StringPiece("Demo"));
+ uint64_t num2 = getVocabID("text");
+ uint64_t num3 = getHash(StringPiece("with"));
+ uint64_t num4 = getVocabID("3");
+ uint64_t num5 = getHash(StringPiece("elements"));
+ uint64_t num6 = 0;
+
+ //Tests
+ bool test1 = getStringFromID(&newmap, num1) == getStringFromID(&vocabids, num1);
+ bool test2 = getStringFromID(&newmap, num2) == getStringFromID(&vocabids, num2);
+ bool test3 = getStringFromID(&newmap, num3) == getStringFromID(&vocabids, num3);
+ bool test4 = getStringFromID(&newmap, num4) == getStringFromID(&vocabids, num4);
+ bool test5 = getStringFromID(&newmap, num5) == getStringFromID(&vocabids, num5);
+ bool test6 = getStringFromID(&newmap, num6) == getStringFromID(&vocabids, num6);
+
+
+ if (test1 && test2 && test3 && test4 && test5 && test6){
+ std::cout << "Map was successfully written and read!" << std::endl;
+ } else {
+ std::cout << "Error! " << test1 << " " << test2 << " " << test3 << " " << test4 << " " << test5 << " " << test6 << std::endl;
+ }
+
+
+ return 1;
+
+}
diff --git a/moses/TranslationModel/ProbingPT/vocabid.cpp b/moses/TranslationModel/ProbingPT/vocabid.cpp
new file mode 100644
index 000000000..bcdbe78d0
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/vocabid.cpp
@@ -0,0 +1,29 @@
+#include "vocabid.hh"
+
+void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin){
+ //Tokenize
+ util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
+
+ while(it){
+ karta->insert(std::pair<uint64_t, std::string>(getHash(*it), it->as_string()));
+ it++;
+ }
+}
+
+void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename){
+ std::ofstream os (filename, std::ios::binary);
+ boost::archive::text_oarchive oarch(os);
+
+ oarch << *karta; //Serialise map
+ os.close();
+}
+
+void read_map(std::map<uint64_t, std::string> *karta, const char* filename){
+ std::ifstream is (filename, std::ios::binary);
+ boost::archive::text_iarchive iarch(is);
+
+ iarch >> *karta;
+
+ //Close the stream after we are done.
+ is.close();
+}
diff --git a/moses/TranslationModel/ProbingPT/vocabid.hh b/moses/TranslationModel/ProbingPT/vocabid.hh
new file mode 100644
index 000000000..491c53439
--- /dev/null
+++ b/moses/TranslationModel/ProbingPT/vocabid.hh
@@ -0,0 +1,20 @@
+//Serialization
+#include <boost/serialization/serialization.hpp>
+#include <boost/serialization/map.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+#include <map> //Container
+#include "hash.hh" //Hash of elements
+
+#include "util/string_piece.hh" //Tokenization and work with StringPiece
+#include "util/tokenize_piece.hh"
+
+void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin);
+
+void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename);
+
+void read_map(std::map<uint64_t, std::string> *karta, const char* filename);
diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.cpp b/moses/TranslationModel/RuleTable/LoaderCompact.cpp
index 468eaafbf..299cfe7ea 100644
--- a/moses/TranslationModel/RuleTable/LoaderCompact.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderCompact.cpp
@@ -222,11 +222,11 @@ bool RuleTableLoaderCompact::LoadRuleSection(
// The remaining columns are currently ignored.
// Create and score target phrase.
- TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase);
+ TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase, &ruleTable);
targetPhrase->SetAlignNonTerm(alignNonTerm);
targetPhrase->SetTargetLHS(targetLhs);
- targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());
+ targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
// Insert rule into table.
TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection(
diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
index dfed93bf4..967d520b3 100644
--- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
@@ -149,10 +149,9 @@ bool RuleTableLoaderStandard::Load(FormatType format
, size_t /* tableLimit */
, RuleTableTrie &ruleTable)
{
- PrintUserTime(string("Start loading text phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");
+ PrintUserTime(string("Start loading text phrase table. ") + (format==MosesFormat?"Moses":"Hiero") + " format");
const StaticData &staticData = StaticData::Instance();
- const std::string& factorDelimiter = staticData.GetFactorDelimiter();
string lineOrig;
size_t count = 0;
@@ -222,12 +221,10 @@ bool RuleTableLoaderStandard::Load(FormatType format
Word *targetLHS;
// create target phrase obj
- TargetPhrase *targetPhrase = new TargetPhrase();
- // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
+ TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable);
targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
// source
Phrase sourcePhrase;
- // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
// rest of target phrase
@@ -247,7 +244,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
}
targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
- targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());
+ targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
phraseColl.Add(targetPhrase);
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
index 4c2f4d186..821b81c51 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
@@ -49,8 +49,8 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
bool ret = loader->Load(m_input, m_output, grammarFile, m_tableLimit,
*this);
- UTIL_THROW_IF2(ret == NULL,
- "Rules not successfully loaded for sentence id " << translationId);
+ UTIL_THROW_IF2(!ret,
+ "Rules not successfully loaded for sentence id " << translationId);
}
void PhraseDictionaryALSuffixArray::CleanUpAfterSentenceProcessing(const InputType &source)
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
index acf803f89..0b65c508b 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
@@ -213,7 +213,6 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSenten
PrintUserTime("Start loading fuzzy-match phrase model");
const StaticData &staticData = StaticData::Instance();
- const std::string& factorDelimiter = staticData.GetFactorDelimiter();
string lineOrig;
@@ -267,12 +266,10 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSenten
// source
Phrase sourcePhrase( 0);
- // sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);
sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);
// create target phrase obj
- TargetPhrase *targetPhrase = new TargetPhrase();
- // targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);
+ TargetPhrase *targetPhrase = new TargetPhrase(this);
targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);
// rest of target phrase
@@ -285,7 +282,7 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSenten
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
- targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());
+ targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
phraseColl.Add(targetPhrase);
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
index 4b0460a17..b39903d58 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
@@ -36,6 +36,8 @@ namespace Moses
{
PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
: MyBase(line)
+ , m_maxSpanDefault(NOT_FOUND)
+ , m_maxSpanLabelled(NOT_FOUND)
{
ReadParameters();
}
@@ -57,7 +59,7 @@ ChartRuleLookupManager *PhraseDictionaryOnDisk::CreateRuleLookupManager(
return new ChartRuleLookupManagerOnDisk(parser, cellCollection, *this,
GetImplementation(),
m_input,
- m_output, m_filePath);
+ m_output);
}
OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation()
@@ -208,5 +210,19 @@ const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollectionN
return targetPhrases;
}
+void PhraseDictionaryOnDisk::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "max-span-default") {
+ m_maxSpanDefault = Scan<size_t>(value);
+ }
+ else if (key == "max-span-labelled") {
+ m_maxSpanLabelled = Scan<size_t>(value);
+ }
+ else {
+ PhraseDictionary::SetParameter(key, value);
+ }
+}
+
+
} // namespace
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
index f32b6ca1e..4deb800f8 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
@@ -48,6 +48,7 @@ class PhraseDictionaryOnDisk : public PhraseDictionary
{
typedef PhraseDictionary MyBase;
friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryOnDisk&);
+ friend class ChartRuleLookupManagerOnDisk;
protected:
#ifdef WITH_THREADS
@@ -56,6 +57,8 @@ protected:
boost::scoped_ptr<OnDiskPt::OnDiskWrapper> m_implementation;
#endif
+ size_t m_maxSpanDefault, m_maxSpanLabelled;
+
OnDiskPt::OnDiskWrapper &GetImplementation();
const OnDiskPt::OnDiskWrapper &GetImplementation() const;
@@ -66,10 +69,6 @@ public:
~PhraseDictionaryOnDisk();
void Load();
- PhraseTableImplementation GetPhraseTableImplementation() const {
- return OnDisk;
- }
-
// PhraseDictionary impl
virtual ChartRuleLookupManager *CreateRuleLookupManager(
const ChartParser &parser,
@@ -82,6 +81,8 @@ public:
const TargetPhraseCollection *GetTargetPhraseCollection(const OnDiskPt::PhraseNode *ptNode) const;
const TargetPhraseCollection *GetTargetPhraseCollectionNonCache(const OnDiskPt::PhraseNode *ptNode) const;
+ void SetParameter(const std::string& key, const std::string& value);
+
};
} // namespace Moses
diff --git a/moses/TranslationModel/SkeletonPT.cpp b/moses/TranslationModel/SkeletonPT.cpp
index 4fb4846f6..c1df952c1 100644
--- a/moses/TranslationModel/SkeletonPT.cpp
+++ b/moses/TranslationModel/SkeletonPT.cpp
@@ -53,7 +53,7 @@ TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
string str = sourcePhrase.GetWord(0).GetFactor(0)->GetString().as_string();
str = "SkeletonPT:" + str;
- TargetPhrase *tp = new TargetPhrase();
+ TargetPhrase *tp = new TargetPhrase(this);
Word &word = tp->AddWord();
word.CreateFromString(Output, m_output, str, false);
@@ -62,7 +62,7 @@ TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
tp->GetScoreBreakdown().PlusEquals(this, scores);
// score of all other ff when this rule is being loaded
- tp->Evaluate(sourcePhrase, GetFeaturesToApply());
+ tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
return tp;
}
diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile
index 1ee663044..3ac7910b2 100644
--- a/moses/TranslationModel/UG/Jamfile
+++ b/moses/TranslationModel/UG/Jamfile
@@ -3,12 +3,109 @@ try-align.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
-install $(PREFIX)/bin : try-align ;
+exe try-align2 :
+try-align2.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)/moses/TranslationModel/UG/generic//stringdist
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe ptable-describe-features :
+ptable-describe-features.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe count-ptable-features :
+count-ptable-features.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe ptable-lookup :
+ptable-lookup.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe sim-pe :
+sim-pe.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe spe-check-coverage :
+spe-check-coverage.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe spe-check-coverage2 :
+spe-check-coverage2.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe spe-check-coverage3 :
+spe-check-coverage3.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_filesystem
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+install $(PREFIX)/bin : try-align try-align2 ;
-fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ;
+fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ;
diff --git a/moses/TranslationModel/UG/count-ptable-features.cc b/moses/TranslationModel/UG/count-ptable-features.cc
new file mode 100644
index 000000000..b4d2cb4dd
--- /dev/null
+++ b/moses/TranslationModel/UG/count-ptable-features.cc
@@ -0,0 +1,26 @@
+#include "mmsapt.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+int main()
+{
+ string line;
+ getline(cin,line);
+ Mmsapt PT(line);
+ PT.Load(false);
+ cout << PT.GetFeatureNames().size() << endl;
+ exit(0);
+}
+
+
+
diff --git a/moses/TranslationModel/UG/generic/Jamfile b/moses/TranslationModel/UG/generic/Jamfile
index ed7b551d4..2a118c9c0 100644
--- a/moses/TranslationModel/UG/generic/Jamfile
+++ b/moses/TranslationModel/UG/generic/Jamfile
@@ -1 +1,2 @@
-fakelib generic : [ glob */*.cc */*.cpp ] ;
+fakelib generic : [ glob */*.cc */*.cpp : stringdist/* ] ;
+fakelib stringdist : [ glob stringdist/*.cc ] ;
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
new file mode 100644
index 000000000..7dc2cd18f
--- /dev/null
+++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
@@ -0,0 +1,50 @@
+//-*- c++ -*-
+#include "ug_splice_arglist.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+#include <boost/foreach.hpp>
+
+namespace Moses {
+
+ void
+ filter_arguments(int const argc_in, char const* const* const argv_in,
+ int & argc_moses, char*** argv_moses,
+ int & argc_other, char*** argv_other,
+ vector<pair<string,int> > const& filter)
+ {
+ *argv_moses = new char*[argc_in];
+ *argv_other = new char*[argc_in];
+ (*argv_moses)[0] = new char[strlen(argv_in[0])+1];
+ strcpy((*argv_moses)[0], argv_in[0]);
+ argc_moses = 1;
+ argc_other = 0;
+ typedef pair<string,int> option;
+ int i = 1;
+ while (i < argc_in)
+ {
+ BOOST_FOREACH(option const& o, filter)
+ {
+ if (o.first == argv_in[i])
+ {
+ (*argv_other)[argc_other] = new char[strlen(argv_in[i])+1];
+ strcpy((*argv_other)[argc_other++],argv_in[i]);
+ for (int k = 0; k < o.second; ++k)
+ {
+ UTIL_THROW_IF2(++i >= argc_in || argv_in[i][0] == '-',
+ "[" << HERE << "] Missing argument for "
+ << "parameter " << o.first << "!");
+ (*argv_other)[argc_other] = new char[strlen(argv_in[i])+1];
+ strcpy((*argv_other)[argc_other++],argv_in[i]);
+ }
+ if (++i >= argc_in) break;
+ }
+ }
+ if (i >= argc_in) break;
+ (*argv_moses)[argc_moses] = new char[strlen(argv_in[i])+1];
+ strcpy((*argv_moses)[argc_moses++], argv_in[i++]);
+ }
+ }
+
+} // namespace Moses
+
+
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
new file mode 100644
index 000000000..e56585e8a
--- /dev/null
+++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
@@ -0,0 +1,18 @@
+//-*- c++ -*-
+#pragma once
+#include <vector>
+#include <string>
+namespace Moses {
+ using namespace std;
+
+ // Function to splice the argument list (e.g. before handing it over to
+ // Moses LoadParam() function. /filter/ is a vector of argument names
+ // and the number of arguments after each of them
+ void
+ filter_arguments(int const argc_in, char const* const* const argv_in,
+ int & argc_moses, char*** argv_moses,
+ int & argc_other, char*** argv_other,
+ vector<pair<string,int> > const& filter);
+
+
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc
new file mode 100644
index 000000000..4b61ecd60
--- /dev/null
+++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc
@@ -0,0 +1,434 @@
+#include <string>
+#include <cassert>
+#include <iomanip>
+#include <algorithm>
+#include "ug_stringdist.h"
+// string distance measures
+// Code by Ulrich Germann
+
+namespace stringdist
+{
+
+ UErrorCode strip_accents(UnicodeString & trg)
+ {
+ UErrorCode status = U_ZERO_ERROR;
+ static Transliterator *stripper
+ = Transliterator::createInstance("NFD; [:M:] Remove; NFC",
+ UTRANS_FORWARD, status);
+ stripper->transliterate(trg);
+ return status;
+ }
+
+ char const*
+ StringDiff::
+ Segment::
+ elabel[] = { "same", "cap", "flip", "permutation",
+ "accent", "duplication",
+ "insertion", "deletion",
+ "mismatch", "noinit" };
+
+ StringDiff::
+ StringDiff()
+ {}
+
+ StringDiff::
+ StringDiff(string const& a, string const& b)
+ {
+ set_a(a);
+ set_b(b);
+ align();
+ }
+
+ StringDiff::
+ Segment::
+ Segment()
+ : start_a(-1), end_a(-1), start_b(-1), end_b(-1), match(noinit), dist(0)
+ {}
+
+ UnicodeString const&
+ StringDiff::
+ set_a(string const& a)
+ {
+ this->a = a.c_str();
+ return this->a;
+ }
+
+ UnicodeString const&
+ StringDiff::
+ set_b(string const& b)
+ {
+ this->b = b.c_str();
+ return this->b;
+ }
+
+ UnicodeString const&
+ StringDiff::
+ get_a() const
+ {
+ return this->a;
+ }
+
+ UnicodeString const&
+ StringDiff::
+ get_b() const
+ {
+ return this->b;
+ }
+
+ size_t
+ StringDiff::
+ size()
+ {
+ return this->difflist.size();
+ }
+
+ // float
+ // StringDiff::
+ // levelshtein(bool force)
+ // {
+ // align(force);
+ // float ret = 0;
+ // for (size_t i = 0; i < difflist.size(); +++i)
+ // {
+ // Segment const& s = difflist[i];
+ // if (s.match == same) continue;
+ // else if (s.match == insertion) ret += s.end_b - s.start_b;
+ // else if (s.match == deletion) ret += s.end_a - s.start_a;
+
+ // }
+ // }
+
+ void
+ StringDiff::
+ fillAlignmentMatrix(vector<vector<float> > & M) const
+ {
+ assert(a.length() && b.length());
+ M.assign(a.length(),vector<float>(b.length(),0));
+ int i = 0,j;
+ while (i < b.length() && b[i] != a[0]) ++i;
+ while (i < b.length()) M[0][i++] = 1;
+ i = 0;
+ while (i < a.length() && a[i] != b[0]) ++i;
+ while (i < a.length()) M[i++][0] = 1;
+ for (i = 1; i < a.length(); ++i)
+ {
+ for (j = 1; j < b.length(); ++j)
+ {
+ float & s = M[i][j];
+ s = max(M[i-1][j],M[i][j-1]);
+ if (a[i] == b[j])
+ s = max(s,M[i-1][j-1] + 1 + (a[i-1] == b[j-1] ? .1f : 0));
+ }
+ }
+#if 0
+ string abuf,bbuf;
+ a.toUTF8String(abuf);
+ b.toUTF8String(bbuf);
+ cout << " " << bbuf[0];
+ for (int x = 1; x < b.length(); ++x)
+ cout << " " << bbuf[x];
+ cout << endl;
+ for (int x = 0; x < a.length(); ++x)
+ {
+ cout << abuf[x] << " ";
+ for (int y = 0; y < b.length(); ++y)
+ cout << int(M[x][y]) << " ";
+ cout << endl;
+ }
+#endif
+ }
+
+ float
+ fillAlignmentMatrix(UChar const* a, size_t const lenA,
+ UChar const* b, size_t const lenB,
+ vector<vector<float> > & M)
+ {
+ M.assign(lenA,vector<float>(lenB,0));
+ assert(lenA); assert(lenB);
+ size_t i = 0;
+ while (i < lenB && b[i] != a[0]) ++i;
+ while (i < lenB) M[0][i++] = 1;
+ i = 0;
+ while (i < lenA && a[i] != b[0]) ++i;
+ while (i < lenA) M[i++][0] = 1;
+ for (i = 1; i < lenA; ++i)
+ {
+ for (size_t j = 1; j < lenB; ++j)
+ {
+ float & s = M[i][j];
+ s = max(M[i-1][j], M[i][j-1]);
+ if (a[i] == b[j])
+ s = max(s, M[i-1][j-1] + 1);
+ }
+ }
+ return M.back().back();
+ }
+
+ float
+ levenshtein(UChar const* a, size_t const lenA,
+ UChar const* b, size_t const lenB)
+ {
+ vector<vector<float> > M;
+ fillAlignmentMatrix(a,lenA,b,lenB,M);
+ size_t ret = 0;
+#define DEBUGME 0
+#if DEBUGME
+ for (size_t i = 0; i < M.size(); ++i)
+ {
+ for (size_t j = 0; j < M[i].size(); ++j)
+ cout << M[i][j] << " ";
+ cout << endl;
+ }
+ cout << string(25,'-') << endl;
+#endif
+
+ int i = M.size() -1;
+ int j = M.back().size() -1;
+ int I=i, J=j;
+ for (;i >= 0 || j >= 0; --i, --j)
+ {
+ I=i, J=j;
+ if (j>=0) while (i > 0 && M[i-1][j] == M[i][j]) --i;
+ if (i>=0) while (j > 0 && M[i][j-1] == M[i][j]) --j;
+ size_t ilen = I >= 0 ? I - i : 0;
+ size_t jlen = J >= 0 ? J - j : 0;
+ ret += max(ilen,jlen);
+#if DEBUGME
+ cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl;
+#endif
+ I=i, J=j;
+ }
+ size_t ilen = I >= 0 ? I - i : 0;
+ size_t jlen = J >= 0 ? J - j : 0;
+ ret += max(ilen,jlen);
+#if DEBUGME
+ cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl;
+#endif
+ return ret;
+ }
+
+
+
+ StringDiff::
+ Segment::
+ Segment(size_t const as, size_t const ae,
+ size_t const bs, size_t const be,
+ UnicodeString const& a,
+ UnicodeString const& b)
+ {
+ dist = 0;
+ start_a = as; end_a = ae;
+ start_b = bs; end_b = be;
+ if (as == ae)
+ match = bs == be ? same : insertion;
+ else if (bs == be)
+ match = deletion;
+ else if (be-bs != ae-as)
+ {
+ match = mismatch;
+ dist = stringdist::levenshtein(a.getBuffer() + as, ae - as,
+ b.getBuffer() + bs, be - bs);
+ }
+ else
+ {
+ match = same;
+ size_t stop = ae-as;
+ for (size_t i = 0; i < stop && match == same; ++i)
+ if (a[as+i] != b[bs+i]) match = mismatch;
+ if (match == mismatch)
+ {
+ if (ae-as == 2 && a[as] == b[bs+1] && a[as+1] == b[bs])
+ match = flip;
+ else
+ {
+ vector<UChar> x(a.getBuffer() + as, a.getBuffer() + ae);
+ vector<UChar> y(b.getBuffer() + bs, b.getBuffer() + be);
+ sort(x.begin(),x.end());
+ sort(y.begin(),y.end());
+ if (x == y) match = permutation;
+ else dist = stringdist::levenshtein(a.getBuffer() + as, ae - as,
+ b.getBuffer() + bs, be - bs);
+ }
+ }
+ }
+ if (match == insertion)
+ {
+ dist = be-bs;
+ }
+ else if (match == deletion)
+ {
+ dist = ae-as;
+ }
+ else if (match == flip) dist = 1;
+ else if (match == permutation) dist = ae-as-1;
+ if (match == mismatch)
+ {
+ UnicodeString ax(a,as,ae-as);
+ UnicodeString bx(b,bs,be-bs);
+ if (ax.toLower() == bx.toLower())
+ match = cap;
+ else
+ {
+ strip_accents(ax);
+ strip_accents(bx);
+ if (ax == bx) match = accent;
+ }
+ }
+ }
+
+ size_t
+ StringDiff::
+ align(bool force)
+ {
+ if (force) difflist.clear();
+ if (difflist.size()) return 0;
+ vector<vector<float> > M;
+ fillAlignmentMatrix(M);
+ // now backtrack
+ int i = a.length() - 1;
+ int j = b.length() - 1;
+ vector<int> A(a.length(), -1);
+ vector<int> B(b.length(), -1);
+ while (i + j)
+ {
+ while (i && M[i-1][j] == M[i][j]) --i;
+ while (j && M[i][j-1] == M[i][j]) --j;
+ if (a[i] == b[j]) { A[i] = j; B[j] = i; }
+ if (i) --i;
+ if (j) --j;
+ }
+ i = a.length() - 1;
+ j = b.length() - 1;
+ vector<int> A2(a.length(), -1);
+ vector<int> B2(b.length(), -1);
+ while (i + j)
+ {
+ while (j && M[i][j-1] == M[i][j]) --j;
+ while (i && M[i-1][j] == M[i][j]) --i;
+ if (a[i] == b[j]) { A2[i] = j; B2[j] = i; }
+ if (i) --i;
+ if (j) --j;
+ }
+ for (size_t k = 0; k < A.size(); ++k)
+ A[k] = min(A[k],A2[k]);
+ for (size_t k = 0; k < B.size(); ++k)
+ B[k] = min(B[k],B2[k]);
+
+ if (a[i] == b[j]) { A[i] = j; B[j] = i; }
+ i = 0;
+ j = 0;
+ size_t I, J;
+ while (i < a.length() and j < b.length())
+ {
+ if (A[i] < 0)
+ {
+ I = i + 1;
+ while (I < A.size() and A[I] < 0) ++I;
+ if (i)
+ { for (J = j = A[i-1]+1; J < B.size() && B[J] < 0; ++J); }
+ else if (I < A.size())
+ { for (j = J = A[I]; j && B[j-1] < 0; --j); }
+ else J = B.size();
+ difflist.push_back(Segment(i,I,j,J,a,b));
+ i = I; j = J;
+ }
+ else if (B[j] < 0)
+ {
+ for (J = j + 1; J < B.size() && B[J] < 0; ++J);
+ difflist.push_back(Segment(i,i,j,J,a,b));
+ j = J;
+ }
+ else
+ {
+ I = i;
+ J = j;
+ while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0)
+ { ++I; ++J; }
+ difflist.push_back(Segment(i,I,j,J,a,b));
+ i = I; j = J;
+ }
+ }
+ if (i < a.length() || j < b.length())
+ difflist.push_back(Segment(i,a.length(),j,b.length(),a,b));
+
+ diffcnt.assign(noinit,0);
+ for (size_t i = 0; i < difflist.size(); ++i)
+ {
+ Segment & s = difflist[i];
+ if (s.match == insertion and
+ ((s.start_a and a[s.start_a - 1] == b[s.start_b]) or
+ (s.end_a < a.length() and a[s.end_a] == b[s.start_b])))
+ {
+ bool sameletter = true;
+ for (int i = s.start_b + 1; sameletter and i < s.end_b; ++i)
+ sameletter = b[i] == b[i-1];
+ if (sameletter) s.match = duplication;
+ }
+ else if (s.match == deletion and
+ ((s.start_b and b[s.start_b - 1] == a[s.start_a]) or
+ (s.end_b < b.length() and b[s.end_b] == a[s.start_a])))
+ {
+ bool sameletter = true;
+ for (int i = s.start_a + 1; sameletter and i < s.end_a; ++i)
+ sameletter = a[i] == a[i-1];
+ if (sameletter) s.match= duplication;
+ }
+ ++diffcnt[s.match];
+ }
+ return 0;
+ }
+
+ void
+ StringDiff::
+ showDiff(std::ostream& out)
+ {
+ if (difflist.size() == 0) align();
+ vector<size_t> fromEnd(difflist.size(),0);
+ for (int d = difflist.size()-1; d-- > 0;)
+ {
+ fromEnd[d] = a.length() - difflist[d].end_a;
+ // cout << d << " " << fromEnd[d] << " "
+ // << difflist[d].start_a << "-"
+ // << difflist[d].end_a << endl;
+ }
+ for (size_t d = 0; d < difflist.size(); ++d)
+ {
+ Segment const& s = difflist[d];
+ UnicodeString aseg,bseg;
+ a.extract(s.start_a, s.end_a - s.start_a, aseg);
+ b.extract(s.start_b, s.end_b - s.start_b, bseg);
+ string abuf,bbuf;
+ aseg.toUTF8String(abuf);
+ bseg.toUTF8String(bbuf);
+ out << abuf << " ";
+ out << bbuf << " ";
+ out << s.label() << " "
+ << s.dist << " "
+ << fromEnd[d]
+ << endl;
+ }
+ }
+
+ char const*
+ StringDiff::
+ Segment::
+ label() const
+ {
+ return elabel[this->match];
+ }
+
+ StringDiff::Segment const&
+ StringDiff::
+ operator[](uint32_t const i) const
+ {
+ return difflist.at(i);
+ }
+
+ vector<int> const&
+ StringDiff::
+ getFeatures() const
+ {
+ return diffcnt;
+ }
+
+}
diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h
new file mode 100644
index 000000000..43fb089f1
--- /dev/null
+++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h
@@ -0,0 +1,87 @@
+//-*- c++ -*-
+#pragma once
+
+// string distance measures
+// Code by Ulrich Germann
+#include<iostream>
+
+
+#include <unicode/stringpiece.h>
+#include <unicode/translit.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/uchar.h>
+#include <unicode/utf8.h>
+#include <vector>
+
+#include "moses/TranslationModel/UG/mm/tpt_typedefs.h"
+
+
+using namespace std;
+//using namespace boost;
+using namespace ugdiss;
+
+namespace stringdist
+{
+ float
+ levenshtein(UChar const* a, size_t const lenA,
+ UChar const* b, size_t const lenB);
+
+ UErrorCode strip_accents(UnicodeString & trg);
+
+ float
+ fillAlignmentMatrix(UChar const* a, size_t const lenA,
+ UChar const* b, size_t const lenB,
+ vector<vector<float> > & M);
+
+ class StringDiff
+ {
+ public:
+ enum MATCHTYPE
+ {
+ same, // a and b are identical
+ cap, // a and b differ only in capitalization
+ flip, // two-letter flip
+ permutation, // a and b have same letters but in different order
+ accent, // a and b are the same basic letters, ignoring accents
+ duplication, // a is empty
+ insertion, // a is empty
+ deletion, // b is empty
+ mismatch, // none of the above
+ noinit // not initialized
+ };
+
+ struct Segment
+ {
+ static char const* elabel[];
+ int start_a, end_a;
+ int start_b, end_b;
+ MATCHTYPE match;
+ float dist;
+ Segment();
+ Segment(size_t const as, size_t const ae,
+ size_t const bs, size_t const be,
+ UnicodeString const& a,
+ UnicodeString const& b);
+ char const* label() const;
+ };
+ private:
+ UnicodeString a,b;
+ vector<Segment> difflist;
+ vector<int> diffcnt;
+ public:
+ UnicodeString const& set_a(string const& a);
+ UnicodeString const& set_b(string const& b);
+ UnicodeString const& get_a() const;
+ UnicodeString const& get_b() const;
+ StringDiff(string const& a, string const& b);
+ StringDiff();
+ size_t size();
+ size_t align(bool force=false); // returns the levenshtein distance
+ void showDiff(std::ostream& out);
+ float levenshtein();
+ Segment const& operator[](uint32_t i) const;
+ void fillAlignmentMatrix(vector<vector<float> > & M) const;
+ vector<int> const& getFeatures() const;
+ };
+}
diff --git a/moses/TranslationModel/UG/mm/Jamfile b/moses/TranslationModel/UG/mm/Jamfile
index 2cc923581..e78338b3d 100644
--- a/moses/TranslationModel/UG/mm/Jamfile
+++ b/moses/TranslationModel/UG/mm/Jamfile
@@ -18,6 +18,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
+exe mtt-demo1 :
+mtt-demo1.cc
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/util//kenutil
+;
+
exe mtt-count-words :
mtt-count-words.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
@@ -72,15 +81,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
-exe custom-pt :
-custom-pt.cc
-$(TOP)/moses//moses
-$(TOP)//boost_iostreams
-$(TOP)//boost_program_options
-$(TOP)/moses/TranslationModel/UG/mm//mm
-$(TOP)/moses/TranslationModel/UG/generic//generic
-$(TOP)/util//kenutil
-;
+# exe custom-pt :
+# custom-pt.cc
+# $(TOP)/moses//moses
+# $(TOP)//boost_iostreams
+# $(TOP)//boost_program_options
+# $(TOP)/moses/TranslationModel/UG/mm//mm
+# $(TOP)/moses/TranslationModel/UG/generic//generic
+# $(TOP)/util//kenutil
+# ;
exe calc-coverage :
@@ -98,12 +107,11 @@ mtt-dump
mtt-count-words
symal2mam
mam2symal
-custom-pt
mmlex-build
mmlex-lookup
mam_verify
calc-coverage
;
-fakelib mm : [ glob ug_*.cc tpt_*.cc ] ;
+fakelib mm : [ glob ug_*.cc tpt_*.cc num_read_write.cc ] ;
diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc
index 9de67ff95..93c8c0eb0 100644
--- a/moses/TranslationModel/UG/mm/custom-pt.cc
+++ b/moses/TranslationModel/UG/mm/custom-pt.cc
@@ -1,6 +1,6 @@
// build a phrase table for the given input
// #include "ug_lexical_phrase_scorer2.h"
-
+#if 0
#include <stdint.h>
#include <string>
#include <vector>
@@ -24,7 +24,7 @@
#include "tpt_pickler.h"
#include "ug_bitext.h"
#include "ug_lexical_phrase_scorer2.h"
-
+#include "../sapt_phrase_scorers.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
@@ -44,8 +44,8 @@ float lbsmooth = .005;
PScorePfwd<Token> calc_pfwd;
PScorePbwd<Token> calc_pbwd;
-PScoreLex<Token> calc_lex;
-PScoreWP<Token> apply_wp;
+PScoreLex<Token> calc_lex(1.0);
+PScoreWC<Token> apply_wp;
vector<float> fweights;
void
@@ -109,6 +109,7 @@ int main(int argc, char* argv[])
{
// assert(argc == 4);
#if 0
+#if 0
string base = argv[1];
string L1 = argv[2];
string L2 = argv[3];
@@ -129,8 +130,8 @@ int main(int argc, char* argv[])
bt.setDefaultSampleSize(max_samples);
size_t i;
- i = calc_pfwd.init(0,.05);
- i = calc_pbwd.init(i,.05);
+ i = calc_pfwd.init(0,.05,'g');
+ i = calc_pbwd.init(i,.05,'g');
i = calc_lex.init(i,base+L1+"-"+L2+".lex");
i = apply_wp.init(i);
@@ -181,7 +182,7 @@ int main(int argc, char* argv[])
}
}
}
-
+#endif
exit(0);
}
-
+#endif
diff --git a/moses/TranslationModel/UG/mm/mmlex-lookup.cc b/moses/TranslationModel/UG/mm/mmlex-lookup.cc
index 14d839edf..fbdceeaa0 100644
--- a/moses/TranslationModel/UG/mm/mmlex-lookup.cc
+++ b/moses/TranslationModel/UG/mm/mmlex-lookup.cc
@@ -131,7 +131,7 @@ interpret_args(int ac, char* av[])
o.add_options()
("help,h", "print this message")
("source,s",po::value<string>(&swrd),"source word")
- ("target,t",po::value<string>(&swrd),"target word")
+ ("target,t",po::value<string>(&twrd),"target word")
;
h.add_options()
diff --git a/moses/TranslationModel/UG/mm/mtt-build.cc b/moses/TranslationModel/UG/mm/mtt-build.cc
index 0518e7161..49fd7f6c2 100644
--- a/moses/TranslationModel/UG/mm/mtt-build.cc
+++ b/moses/TranslationModel/UG/mm/mtt-build.cc
@@ -361,7 +361,7 @@ build_mmTSA(string infile, string outfile)
{
size_t mypid = fork();
if(mypid) return mypid;
- shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile));
+ boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile));
bdBitset filter;
filter.resize(T->size(),true);
imTSA<Token> S(T,&filter,(quiet?NULL:&cerr));
diff --git a/moses/TranslationModel/UG/mm/mtt-demo1.cc b/moses/TranslationModel/UG/mm/mtt-demo1.cc
new file mode 100644
index 000000000..a253e9ed3
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/mtt-demo1.cc
@@ -0,0 +1,54 @@
+// -*- c++ -*-
+// Demo program for use of single-track suffix arrays
+
+#include <boost/program_options.hpp>
+#include <iomanip>
+
+#include "tpt_typedefs.h"
+#include "ug_mm_ttrack.h"
+#include "ug_mm_tsa.h"
+#include "tpt_tokenindex.h"
+#include "ug_deptree.h"
+#include "ug_corpus_token.h"
+
+using namespace Moses;
+using namespace std;
+using namespace boost;
+using namespace ugdiss;
+typedef L2R_Token < SimpleWordId > Token;
+int main(int argc, char* argv[])
+{
+ using namespace std;
+ if (argc < 3)
+ {
+ cerr << "usage: " << argv[0] << " <track base name> lookup word sequence"
+ << endl;
+ }
+ string base = argv[1];
+ TokenIndex V;
+ V.open(base+".tdx");
+ boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>());
+ T->open(base+".mct");
+ mmTSA<Token> I; I.open(base+".sfa",T);
+ mmTSA<Token>::tree_iterator m(&I);
+
+ // look up the search string m.extend() returns true upon success
+ for (int i = 2; i < argc && m.extend(V[argv[i]]); ++i);
+ if (int(m.size() + 2) < argc)
+ {
+ cerr << "NOT FOUND" << endl;
+ exit(1);
+ }
+
+ tsa::ArrayEntry e(m.lower_bound(-1));
+ char const* stop = m.upper_bound(-1);
+ do
+ {
+ m.root->readEntry(e.next,e);
+ Token const* t = T->sntStart(e.sid) + e.offset;
+ Token const* z = T->sntEnd(e.sid);
+ for (;t != z; t = t->next()) cout << V[t->id()] << " ";
+ cout << endl;
+ } while (e.next != stop);
+
+}
diff --git a/moses/TranslationModel/UG/mm/num_read_write.cc b/moses/TranslationModel/UG/mm/num_read_write.cc
new file mode 100644
index 000000000..403f7d300
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/num_read_write.cc
@@ -0,0 +1,74 @@
+#include "num_read_write.h"
+namespace ugdiss {
+ typedef unsigned char uchar;
+
+ void
+ numwrite(std::ostream& out, uint16_t const& x)
+ {
+ char buf[2];
+ buf[0] = x%256;
+ buf[1] = (x>>8)%256;
+ out.write(buf,2);
+ }
+
+ void
+ numwrite(std::ostream& out, uint32_t const& x)
+ {
+ char buf[4];
+ buf[0] = x%256;
+ buf[1] = (x>>8)%256;
+ buf[2] = (x>>16)%256;
+ buf[3] = (x>>24)%256;
+ out.write(buf,4);
+ }
+
+ void
+ numwrite(std::ostream& out, uint64_t const& x)
+ {
+ char buf[8];
+ buf[0] = x%256;
+ buf[1] = (x>>8)%256;
+ buf[2] = (x>>16)%256;
+ buf[3] = (x>>24)%256;
+ buf[4] = (x>>32)%256;
+ buf[5] = (x>>40)%256;
+ buf[6] = (x>>48)%256;
+ buf[7] = (x>>56)%256;
+ out.write(buf,8);
+ }
+
+ char const*
+ numread(char const* src, uint16_t & x)
+ {
+ uchar const* d = reinterpret_cast<uchar const*>(src);
+ x = (uint16_t(d[0])<<0) | (uint16_t(d[1])<<8);
+ return src+2;
+ }
+
+ char const*
+ numread(char const* src, uint32_t & x)
+ {
+ uchar const* d = reinterpret_cast<uchar const*>(src);
+ x = ((uint32_t(d[0])<<0) |
+ (uint32_t(d[1])<<8) |
+ (uint32_t(d[2])<<16)|
+ (uint32_t(d[3])<<24));
+ return src+4;
+ }
+
+ char const*
+ numread(char const* src, uint64_t & x)
+ {
+ uchar const* d = reinterpret_cast<uchar const*>(src);
+ x = ((uint64_t(d[0])<<0) |
+ (uint64_t(d[1])<<8) |
+ (uint64_t(d[2])<<16) |
+ (uint64_t(d[3])<<24) |
+ (uint64_t(d[4])<<32) |
+ (uint64_t(d[5])<<40) |
+ (uint64_t(d[6])<<48) |
+ (uint64_t(d[7])<<56));
+ return src+8;
+ }
+
+}
diff --git a/moses/TranslationModel/UG/mm/num_read_write.h b/moses/TranslationModel/UG/mm/num_read_write.h
index 96630f4b0..6fdcecc81 100644
--- a/moses/TranslationModel/UG/mm/num_read_write.h
+++ b/moses/TranslationModel/UG/mm/num_read_write.h
@@ -1,66 +1,78 @@
// -*- c++ -*-
// (c) 2006,2007,2008 Ulrich Germann
-#ifndef __num_read_write_hh
-#define __num_read_write_hh
+// #ifndef __num_read_write_hh
+// #define __num_read_write_hh
+#pragma once
#include <stdint.h>
#include <iostream>
-#include <endian.h>
-#include <byteswap.h>
-#include "tpt_typedefs.h"
+// #include <endian.h>
+// #include <byteswap.h>
+// #include "tpt_typedefs.h"
namespace ugdiss {
+
+ void numwrite(std::ostream& out, uint16_t const& x);
+ void numwrite(std::ostream& out, uint32_t const& x);
+ void numwrite(std::ostream& out, uint64_t const& x);
+
+ char const* numread(char const* src, uint16_t & x);
+ char const* numread(char const* src, uint32_t & x);
+ char const* numread(char const* src, uint64_t & x);
-template<typename uintNumber>
-void
-numwrite(std::ostream& out, uintNumber const& x)
-{
-#if __BYTE_ORDER == __BIG_ENDIAN
- uintNumber y;
- switch (sizeof(uintNumber))
- {
- case 2: y = bswap_16(x); break;
- case 4: y = bswap_32(x); break;
- case 8: y = bswap_64(x); break;
- default: y = x;
- }
- out.write(reinterpret_cast<char*>(&y),sizeof(y));
-#else
- out.write(reinterpret_cast<char const*>(&x),sizeof(x));
-#endif
-}
+// template<typename uintNumber>
+// void
+// numwrite(std::ostream& out, uintNumber const& x)
+// {
+// uchar const* c = reinterpret_cast<uchar const*>(&x);
+// for (size_t i = 0; i < sizeof(x); ++i)
+// out.write(c
+// #if __BYTE_ORDER == __BIG_ENDIAN
+// uintNumber y;
+// switch (sizeof(uintNumber))
+// {
+// case 2: y = bswap_16(x); break;
+// case 4: y = bswap_32(x); break;
+// case 8: y = bswap_64(x); break;
+// default: y = x;
+// }
+// out.write(reinterpret_cast<char*>(&y),sizeof(y));
+// #else
+// out.write(reinterpret_cast<char const*>(&x),sizeof(x));
+// #endif
+// }
-template<typename uintNumber>
-void
-numread(std::istream& in, uintNumber& x)
-{
- in.read(reinterpret_cast<char*>(&x),sizeof(uintNumber));
-#if __BYTE_ORDER == __BIG_ENDIAN
- switch (sizeof(uintNumber))
- {
- case 2: x = bswap_16(x); break;
- case 4: x = bswap_32(x); break;
- case 8: x = bswap_64(x); break;
- default: break;
- }
-#endif
-}
+// template<typename uintNumber>
+// void
+// numread(std::istream& in, uintNumber& x)
+// {
+// in.read(reinterpret_cast<char*>(&x),sizeof(uintNumber));
+// #if __BYTE_ORDER == __BIG_ENDIAN
+// switch (sizeof(uintNumber))
+// {
+// case 2: x = bswap_16(x); break;
+// case 4: x = bswap_32(x); break;
+// case 8: x = bswap_64(x); break;
+// default: break;
+// }
+// #endif
+// }
-template<typename uintNumber>
-char const*
-numread(char const* src, uintNumber& x)
-{
- // ATTENTION: THIS NEEDS TO BE VERIFIED FOR BIG-ENDIAN MACHINES!!!
- x = *reinterpret_cast<uintNumber const*>(src);
-#if __BYTE_ORDER == __BIG_ENDIAN
- switch (sizeof(uintNumber))
- {
- case 2: x = bswap_16(x); break;
- case 4: x = bswap_32(x); break;
- case 8: x = bswap_64(x); break;
- default: break;
- }
-#endif
- return src+sizeof(uintNumber);
-}
+// template<typename uintNumber>
+// char const*
+// numread(char const* src, uintNumber& x)
+// {
+// // ATTENTION: THIS NEEDS TO BE VERIFIED FOR BIG-ENDIAN MACHINES!!!
+// x = *reinterpret_cast<uintNumber const*>(src);
+// #if __BYTE_ORDER == __BIG_ENDIAN
+// switch (sizeof(uintNumber))
+// {
+// case 2: x = bswap_16(x); break;
+// case 4: x = bswap_32(x); break;
+// case 8: x = bswap_64(x); break;
+// default: break;
+// }
+// #endif
+// return src+sizeof(uintNumber);
+// }
} // end of namespace ugdiss
-#endif
+//#endif
diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.cc b/moses/TranslationModel/UG/mm/tpt_pickler.cc
index 87fad195c..c23913fc2 100644
--- a/moses/TranslationModel/UG/mm/tpt_pickler.cc
+++ b/moses/TranslationModel/UG/mm/tpt_pickler.cc
@@ -268,9 +268,7 @@ namespace ugdiss
}
- template<>
- char const*
- binread<uint16_t>(char const* p, uint16_t& buf)
+ char const *binread(char const* p, uint16_t& buf)
{
static char mask = 127;
buf = (*p)&mask;
@@ -286,9 +284,14 @@ namespace ugdiss
return p;
}
- template<>
- char const*
- binread<uint32_t>(char const* p, uint32_t& buf)
+#ifdef __clang__
+ char const *binread(char const* p, size_t& buf)
+ {
+ return binread(p, (uint32_t&) buf);
+ }
+#endif
+
+ char const *binread(char const* p, uint32_t& buf)
{
static char mask = 127;
@@ -325,9 +328,7 @@ namespace ugdiss
return ++p;
}
- template<>
- char const*
- binread<filepos_type>(char const* p, filepos_type& buf)
+ char const *binread(char const* p, filepos_type& buf)
{
static char mask = 127;
@@ -394,9 +395,7 @@ namespace ugdiss
return ++p;
}
- template<>
- char const*
- binread<float>(char const* p, float& buf)
+ char const *binread(char const* p, float& buf)
{
buf = *reinterpret_cast<float const*>(p);
return p+sizeof(float);
diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.h b/moses/TranslationModel/UG/mm/tpt_pickler.h
index fa603b3b6..7305a858e 100644
--- a/moses/TranslationModel/UG/mm/tpt_pickler.h
+++ b/moses/TranslationModel/UG/mm/tpt_pickler.h
@@ -15,7 +15,7 @@ namespace ugdiss
{
/// Utility method placed here for lack of a better place
/// @return the size of file fname.
- uint64_t getFileSize(const std::string& fname);
+ ::uint64_t getFileSize(const std::string& fname);
/**
* The following functions write and read data in a compact binary
@@ -42,6 +42,14 @@ namespace ugdiss
void binread(std::istream& in, std::string &data);
void binread(std::istream& in, float &data);
+ char const *binread(char const* p, uint16_t& buf);
+ char const *binread(char const* p, uint32_t& buf);
+ char const *binread(char const* p, filepos_type& buf);
+ char const *binread(char const* p, float& buf);
+#ifdef __clang__
+ char const *binread(char const* p, size_t& buf);
+#endif
+
std::ostream& write(std::ostream& out, char x);
std::ostream& write(std::ostream& out, unsigned char x);
std::ostream& write(std::ostream& out, short x);
@@ -58,6 +66,7 @@ namespace ugdiss
std::istream& read(std::istream& in, size_t& x);
std::istream& read(std::istream& in, float& x);
+ /*
template<typename WHATEVER>
char const*
binread(char const* p, WHATEVER* buf);
@@ -65,6 +74,7 @@ namespace ugdiss
template<typename numtype>
char const*
binread(char const* p, numtype& buf);
+ */
template<typename K, typename V>
void binwrite(std::ostream& out, std::pair<K,V> const& data);
@@ -93,11 +103,11 @@ namespace ugdiss
template<typename V>
char const* binread(char const* p, std::vector<V>& v)
{
- size_t vsize;
+ size_t vsize;
#ifdef VERIFY_TIGHT_PACKING
assert(p);
#endif
- p = binread(p,vsize);
+ p = binread(p, vsize);
v.resize(vsize);
for (size_t i = 0; i < vsize; ++i)
p = binread(p,v[i]);
@@ -199,9 +209,6 @@ namespace ugdiss
return binread(p,*buf);
}
- template<typename numtype>
- char const*
- binread(char const* p, numtype& buf);
} // end namespace ugdiss
#endif
diff --git a/moses/TranslationModel/UG/mm/tpt_tightindex.h b/moses/TranslationModel/UG/mm/tpt_tightindex.h
index 27d2ba059..66594bc0a 100644
--- a/moses/TranslationModel/UG/mm/tpt_tightindex.h
+++ b/moses/TranslationModel/UG/mm/tpt_tightindex.h
@@ -10,8 +10,9 @@
#include <iostream>
#include <sstream>
#include "tpt_typedefs.h"
+// #include <stdint.h>
#include <cassert>
-// using namespace std;
+using namespace std;
#ifndef uchar
#endif
@@ -28,7 +29,7 @@ namespace ugdiss
{
// void tightwritex(iostream& out, size_t data, bool flag);
void
- tightwrite(std::ostream& out, uint64_t data, bool flag);
+ tightwrite(std::ostream& out, ::uint64_t data, bool flag);
filepos_type
tightread(std::istream& in, std::ios::pos_type stop);
@@ -90,7 +91,7 @@ namespace ugdiss
tightread4(char const* start, char const* stop, uint32_t& dest);
char const*
- tightread8(char const* start, char const* stop, uint64_t& dest);
+ tightread8(char const* start, char const* stop, ::uint64_t& dest);
template<typename numType>
char const*
@@ -101,13 +102,13 @@ namespace ugdiss
if (sizeof(numType)==4)
return tightread4(start,stop,reinterpret_cast<uint32_t&>(dest));
else if (sizeof(numType)==8)
- return tightread8(start,stop,reinterpret_cast<uint64_t&>(dest));
+ return tightread8(start,stop,reinterpret_cast<typename ::uint64_t&>(dest));
assert(0);
return NULL;
}
// char const*
-// tightread(char const* start, char const* stop, uint64_t& dest);
+// tightread(char const* start, char const* stop, ::uint64_t& dest);
// char const*
// tightread(char const* start, char const* stop, filepos_type& dest);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index c4f5175f3..944b327df 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -11,7 +11,9 @@ namespace Moses
namespace bitext
{
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
ThreadSafeCounter pstats::active;
+#endif
pstats::
pstats()
@@ -23,14 +25,15 @@ namespace Moses
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
- // if (++active%5 == 0)
- // cerr << size_t(active) << " active pstats at " << __FILE__ << ":" << __LINE__ << endl;
}
pstats::
~pstats()
{
- --active;
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ // counter may not exist any more at destruction time, so try ... catch
+ try { --active; } catch (...) {}
+#endif
}
void
@@ -54,7 +57,7 @@ namespace Moses
bool
pstats::
- add(uint64_t pid, float const w,
+ add(::uint64_t pid, float const w,
vector<uchar> const& a,
uint32_t const cnt2,
uint32_t fwd_o,
@@ -158,187 +161,35 @@ namespace Moses
jstats::
invalidate()
{
- my_rcnt = 0;
+ if (my_wcnt > 0)
+ my_wcnt *= -1;
}
- bool
+ void
jstats::
- valid()
- {
- return my_rcnt != 0;
- }
-
- bool
- PhrasePair::
- operator<=(PhrasePair const& other) const
- {
- return this->score <= other.score;
- }
-
- bool
- PhrasePair::
- operator>=(PhrasePair const& other) const
+ validate()
{
- return this->score >= other.score;
+ if (my_wcnt < 0)
+ my_wcnt *= -1;
}
bool
- PhrasePair::
- operator<(PhrasePair const& other) const
- {
- return this->score < other.score;
- }
-
- bool
- PhrasePair::
- operator>(PhrasePair const& other) const
- {
- return this->score > other.score;
- }
-
- PhrasePair::
- PhrasePair() {}
-
- PhrasePair::
- PhrasePair(PhrasePair const& o)
- : p1(o.p1),
- p2(o.p2),
- raw1(o.raw1),
- raw2(o.raw2),
- sample1(o.sample1),
- sample2(o.sample2),
- good1(o.good1),
- good2(o.good2),
- joint(o.joint),
- fvals(o.fvals),
- aln(o.aln),
- score(o.score)
- {
- for (size_t i = 0; i <= po_other; ++i)
- {
- dfwd[i] = o.dfwd[i];
- dbwd[i] = o.dbwd[i];
- }
- }
-
- void
- PhrasePair::
- init(uint64_t const pid1, pstats const& ps, size_t const numfeats)
+ jstats::
+ valid()
{
- p1 = pid1;
- p2 = 0;
- raw1 = ps.raw_cnt;
- sample1 = ps.sample_cnt;
- sample2 = 0;
- good1 = ps.good;
- good2 = 0;
- raw2 = 0;
- fvals.resize(numfeats);
+ return my_wcnt >= 0;
}
- void
- PhrasePair::
- init(uint64_t const pid1,
- pstats const& ps1,
- pstats const& ps2,
- size_t const numfeats)
- {
- p1 = pid1;
- raw1 = ps1.raw_cnt + ps2.raw_cnt;
- sample1 = ps1.sample_cnt + ps2.sample_cnt;
- sample2 = 0;
- good1 = ps1.good + ps2.good;
- good2 = 0;
- fvals.resize(numfeats);
- }
float
lbop(size_t const tries, size_t const succ, float const confidence)
{
- return
- boost::math::binomial_distribution<>::
- find_lower_bound_on_p(tries, succ, confidence);
+ return (confidence == 0
+ ? float(succ)/tries
+ : (boost::math::binomial_distribution<>::
+ find_lower_bound_on_p(tries, succ, confidence)));
}
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2, jstats const& js)
- {
- p2 = pid2;
- raw2 = js.cnt2();
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- float total_fwd = 0, total_bwd = 0;
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- total_fwd += js.dcnt_fwd(po)+1;
- total_bwd += js.dcnt_bwd(po)+1;
- }
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
- dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
- }
- return *this;
- }
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2, jstats const& js1, jstats const& js2)
- {
- p2 = pid2;
- raw2 = js1.cnt2() + js2.cnt2();
- joint = js1.rcnt() + js2.rcnt();
- assert(js1.aln().size() || js2.aln().size());
- if (js1.aln().size())
- aln = js1.aln()[0].second;
- else if (js2.aln().size())
- aln = js2.aln()[0].second;
- for (int i = po_first; i < po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
- dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
- }
- return *this;
- }
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2,
- size_t const raw2extra,
- jstats const& js)
- {
- p2 = pid2;
- raw2 = js.cnt2() + raw2extra;
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
- dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
- }
- return *this;
- }
-
- float
- PhrasePair::
- eval(vector<float> const& w)
- {
- assert(w.size() == this->fvals.size());
- this->score = 0;
- for (size_t i = 0; i < w.size(); ++i)
- this->score += w[i] * this->fvals[i];
- return this->score;
- }
-
template<>
sptr<imBitext<L2R_Token<SimpleWordId> > >
imBitext<L2R_Token<SimpleWordId> >::
@@ -355,7 +206,7 @@ namespace Moses
sptr<imBitext<TKN> > ret;
{
- lock_guard<mutex> guard(this->lock);
+ boost::lock_guard<boost::mutex> guard(this->lock);
ret.reset(new imBitext<TKN>(*this));
}
@@ -370,7 +221,8 @@ namespace Moses
uint32_t row,col; char c;
while (ibuf >> row >> c >> col)
{
- assert(c == '-');
+ UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
+ << "Error in alignment information:\n" << a);
binwrite(obuf,row);
binwrite(obuf,col);
}
@@ -638,7 +490,6 @@ namespace Moses
cout << string(90,'-') << endl;
}
-
PhraseOrientation
find_po_fwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
@@ -653,13 +504,13 @@ namespace Moses
ushort ns1,ne1,ne2;
if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
- {
- return po_other;
- }
+ return po_other;
+
if (ns1 >= e1)
{
for (ushort j = e1; j < ns1; ++j)
- if (a1[j].size()) return po_jfwd;
+ if (a1[j].size())
+ return po_jfwd;
return po_mono;
}
else
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 84c3713ac..bc7c75c07 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -15,6 +15,10 @@
//
// - use multiple agendas for better load balancing and to avoid
// competition for locks
+//
+
+
+#define UG_BITEXT_TRACK_ACTIVE_THREADS 0
#include <string>
#include <vector>
@@ -26,6 +30,7 @@
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/random.hpp>
+#include <boost/format.hpp>
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
@@ -46,6 +51,8 @@
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
#include "ug_lexical_phrase_scorer2.h"
+#include "ug_phrasepair.h"
+#include "ug_lru_cache.h"
#define PSTATS_CACHE_THRESHOLD 50
@@ -55,6 +62,8 @@ namespace Moses {
class Mmsapt;
namespace bitext
{
+ template<typename TKN> class Bitext;
+ template<typename TKN> class PhrasePair;
using namespace ugdiss;
template<typename TKN> class Bitext;
@@ -84,11 +93,11 @@ namespace Moses {
template<typename sid_t, typename off_t, typename len_t>
void
- parse_pid(uint64_t const pid, sid_t & sid,
+ parse_pid(::uint64_t const pid, sid_t & sid,
off_t & off, len_t& len)
{
- static uint64_t two32 = uint64_t(1)<<32;
- static uint64_t two16 = uint64_t(1)<<16;
+ static ::uint64_t two32 = ::uint64_t(1)<<32;
+ static ::uint64_t two16 = ::uint64_t(1)<<16;
len = pid%two16;
off = (pid%two32)>>16;
sid = pid>>32;
@@ -119,6 +128,7 @@ namespace Moses {
void add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient);
void invalidate();
+ void validate();
bool valid();
uint32_t dcnt_fwd(PhraseOrientation const idx) const;
uint32_t dcnt_bwd(PhraseOrientation const idx) const;
@@ -127,7 +137,10 @@ namespace Moses {
struct
pstats
{
+
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
static ThreadSafeCounter active;
+#endif
boost::mutex lock; // for parallel gathering of stats
boost::condition_variable ready; // consumers can wait for this data structure to be ready.
@@ -139,8 +152,8 @@ namespace Moses {
uint32_t ofwd[po_other+1], obwd[po_other+1];
- // typedef typename boost::unordered_map<uint64_t, jstats> trg_map_t;
- typedef typename std::map<uint64_t, jstats> trg_map_t;
+ // typedef typename boost::unordered_map<typename ::uint64_t, jstats> trg_map_t;
+ typedef std::map<typename ::uint64_t, jstats> trg_map_t;
trg_map_t trg;
pstats();
~pstats();
@@ -149,283 +162,253 @@ namespace Moses {
size_t count_workers() { return in_progress; }
bool
- add(uint64_t const pid,
+ add(::uint64_t const pid,
float const w,
vector<uchar> const& a,
uint32_t const cnt2,
uint32_t fwd_o, uint32_t bwd_o);
};
+
+ template<typename Token>
+ string
+ toString(TokenIndex const& V, Token const* x, size_t const len)
+ {
+ if (!len) return "";
+ UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+ ostringstream buf;
+ buf << V[x->id()];
+ size_t i = 1;
+ for (x = x->next(); x && i < len; ++i, x = x->next())
+ buf << " " << V[x->id()];
+ UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
+ return buf.str();
+ }
+
+ template<typename Token>
class
PhrasePair
{
public:
- uint64_t p1, p2;
+ class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; };
+ Token const* start1;
+ Token const* start2;
+ uint32_t len1;
+ uint32_t len2;
+ ::uint64_t p1, p2;
uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
vector<float> fvals;
- float dfwd[po_other+1];
- float dbwd[po_other+1];
+ float dfwd[po_other+1]; // distortion counts // counts or probs?
+ float dbwd[po_other+1]; // distortion counts
vector<uchar> aln;
- // float avlex12,avlex21; // average lexical probs (Moses std)
- // float znlex1,znlex2; // zens-ney lexical smoothing
- // float colex1,colex2; // based on raw lexical occurrences
float score;
- PhrasePair();
+ bool inverse;
+ PhrasePair() { };
PhrasePair(PhrasePair const& o);
+
+ PhrasePair const& operator+=(PhrasePair const& other);
+
bool operator<(PhrasePair const& other) const;
bool operator>(PhrasePair const& other) const;
- bool operator<=(PhrasePair const& other) const;
+ bool operator<=(PhrasePair const& other) const;
bool operator>=(PhrasePair const& other) const;
- void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
- void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
- size_t const numfeats);
+ void init();
+ void init(::uint64_t const pid1, bool is_inverse,
+ Token const* x, uint32_t const len,
+ pstats const* ps = NULL, size_t const numfeats=0);
+
+ // void init(::uint64_t const pid1, pstats const& ps, size_t const numfeats);
+ // void init(::uint64_t const pid1, pstats const& ps1, pstats const& ps2,
+ // size_t const numfeats);
- PhrasePair const&
- update(uint64_t const pid2, jstats const& js);
+ // PhrasePair const&
+ // update(::uint64_t const pid2, size_t r2 = 0);
PhrasePair const&
- update(uint64_t const pid2, jstats const& js1, jstats const& js2);
+ update(::uint64_t const pid2, Token const* x,
+ uint32_t const len, jstats const& js);
+
+ // PhrasePair const&
+ // update(::uint64_t const pid2, jstats const& js1, jstats const& js2);
- PhrasePair const&
- update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
+ // PhrasePair const&
+ // update(::uint64_t const pid2, size_t const raw2extra, jstats const& js);
- float eval(vector<float> const& w);
+ // float
+ // eval(vector<float> const& w);
+
+ class SortByTargetIdSeq
+ {
+ public:
+ int cmp(PhrasePair const& a, PhrasePair const& b) const;
+ bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+ };
};
template<typename Token>
- class
- PhraseScorer
+ void
+ PhrasePair<Token>::
+ init(::uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len,
+ pstats const* ps, size_t const numfeats)
{
- protected:
- int index;
- int num_feats;
- public:
-
- virtual
- void
- operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest)
- const = 0;
-
- int
- fcnt() const
- { return num_feats; }
-
- int
- getIndex() const
- { return index; }
- };
+ inverse = is_inverse;
+ start1 = x; len1 = len;
+ p1 = pid1;
+ p2 = 0;
+ if (ps)
+ {
+ raw1 = ps->raw_cnt;
+ sample1 = ps->sample_cnt;
+ good1 = ps->good;
+ }
+ else raw1 = sample1 = good1 = 0;
+ joint = 0;
+ good2 = 0;
+ sample2 = 0;
+ raw2 = 0;
+ fvals.resize(numfeats);
+ }
template<typename Token>
- class
- PScorePfwd : public PhraseScorer<Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>::
+ update(::uint64_t const pid2,
+ Token const* x, uint32_t const len, jstats const& js)
{
- float conf;
- char denom;
- public:
- PScorePfwd()
- {
- this->num_feats = 1;
- }
+ p2 = pid2;
+ start2 = x; len2 = len;
+ raw2 = js.cnt2();
+ joint = js.rcnt();
+ assert(js.aln().size());
+ if (js.aln().size())
+ aln = js.aln()[0].second;
+ float total_fwd = 0, total_bwd = 0;
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ total_fwd += js.dcnt_fwd(po)+1;
+ total_bwd += js.dcnt_bwd(po)+1;
+ }
- int
- init(int const i, float const c, char d=0)
- {
- conf = c;
- denom = d;
- this->index = i;
- return i + this->num_feats;
- }
+ // should we do that here or leave the raw counts?
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
+ dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
+ }
- void
- operator()(Bitext<Token> const& bt,
- PhrasePair & pp,
- vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- if (pp.joint > pp.good1)
- {
- cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
- cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
- }
- switch (denom)
- {
- case 'g':
- (*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
- break;
- case 's':
- (*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf));
- break;
- case 'r':
- (*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf));
- }
- }
- };
+ return *this;
+ }
template<typename Token>
- class
- PScorePbwd : public PhraseScorer<Token>
- {
- float conf;
- public:
- PScorePbwd()
- {
- this->num_feats = 1;
- }
+ bool
+ PhrasePair<Token>::
+ operator<(PhrasePair const& other) const
+ { return this->score < other.score; }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator>(PhrasePair const& other) const
+ { return this->score > other.score; }
- int
- init(int const i, float const c)
- {
- conf = c;
- this->index = i;
- return i + this->num_feats;
- }
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator<=(PhrasePair const& other) const
+ { return this->score <= other.score; }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator>=(PhrasePair const& other) const
+ { return this->score >= other.score; }
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp,
- vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- (*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint),pp.joint,conf));
- }
- };
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>::
+ operator+=(PhrasePair const& o)
+ {
+ raw1 += o.raw1;
+ raw2 += o.raw2;
+ sample1 += o.sample1;
+ sample2 += o.sample2;
+ good1 += o.good1;
+ good2 += o.good2;
+ joint += o.joint;
+ return *this;
+ }
template<typename Token>
- class
- PScoreLogCounts : public PhraseScorer<Token>
+ PhrasePair<Token>::
+ PhrasePair(PhrasePair<Token> const& o)
+ : start1(o.start1)
+ , start2(o.start2)
+ , len1(o.len1)
+ , len2(o.len2)
+ , p1(o.p1)
+ , p2(o.p2)
+ , raw1(o.raw1)
+ , raw2(o.raw2)
+ , sample1(o.sample1)
+ , sample2(o.sample2)
+ , good1(o.good1)
+ , good2(o.good2)
+ , joint(o.joint)
+ , fvals(o.fvals)
+ , aln(o.aln)
+ , score(o.score)
+ , inverse(o.inverse)
{
- float conf;
- public:
- PScoreLogCounts()
- {
- this->num_feats = 4;
- }
-
- int
- init(int const i)
- {
- this->index = i;
- return i + this->num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp,
- vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- size_t i = this->index;
- assert(pp.raw1);
- assert(pp.sample1);
- assert(pp.joint);
- assert(pp.raw2);
- (*dest)[i] = log(pp.raw1);
- (*dest)[++i] = log(pp.sample1);
- (*dest)[++i] = log(pp.joint);
- (*dest)[++i] = log(pp.raw2);
- }
- };
-
+ for (size_t i = 0; i <= po_other; ++i)
+ {
+ dfwd[i] = o.dfwd[i];
+ dbwd[i] = o.dbwd[i];
+ }
+ }
+
template<typename Token>
- class
- PScoreLex : public PhraseScorer<Token>
+ int
+ PhrasePair<Token>::
+ SortByTargetIdSeq::
+ cmp(PhrasePair const& a, PhrasePair const& b) const
{
- public:
- LexicalPhraseScorer2<Token> scorer;
-
- PScoreLex() { this->num_feats = 2; }
-
- int
- init(int const i, string const& fname)
- {
- scorer.open(fname);
- this->index = i;
- return i + this->num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
- parse_pid(pp.p1, sid1, off1, len1);
- parse_pid(pp.p2, sid2, off2, len2);
-
-#if 0
- cout << len1 << " " << len2 << endl;
- Token const* t1 = bt.T1->sntStart(sid1);
- for (size_t i = off1; i < off1 + len1; ++i)
- cout << (*bt.V1)[t1[i].id()] << " ";
- cout << __FILE__ << ":" << __LINE__ << endl;
-
- Token const* t2 = bt.T2->sntStart(sid2);
- for (size_t i = off2; i < off2 + len2; ++i)
- cout << (*bt.V2)[t2[i].id()] << " ";
- cout << __FILE__ << ":" << __LINE__ << endl;
-
- BOOST_FOREACH (int a, pp.aln)
- cout << a << " " ;
- cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
-
-#endif
- scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
- bt.T2->sntStart(sid2)+off2,0,len2,
- pp.aln, (*dest)[this->index],
- (*dest)[this->index+1]);
- }
-
- };
-
- /// Word penalty
+ size_t i = 0;
+ Token const* x = a.start2;
+ Token const* y = b.start2;
+ while (i < a.len2 && i < b.len2 && x->id() == y->id())
+ {
+ x = x->next();
+ y = y->next();
+ ++i;
+ }
+ if (i == a.len2 && i == b.len2) return 0;
+ if (i == a.len2) return -1;
+ if (i == b.len2) return 1;
+ return x->id() < y->id() ? -1 : 1;
+ }
+
template<typename Token>
- class
- PScoreWP : public PhraseScorer<Token>
+ bool
+ PhrasePair<Token>::
+ SortByTargetIdSeq::
+ operator()(PhrasePair const& a, PhrasePair const& b) const
{
- public:
-
- PScoreWP() { this->num_feats = 1; }
-
- int
- init(int const i)
- {
- this->index = i;
- return i + this->num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- uint32_t sid2=0,off2=0,len2=0;
- parse_pid(pp.p2, sid2, off2, len2);
- (*dest)[this->index] = len2;
- }
-
- };
+ return this->cmp(a,b) < 0;
+ }
- /// Phrase penalty
template<typename Token>
- class
- PScorePP : public PhraseScorer<Token>
+ void
+ PhrasePair<Token>::
+ init()
{
- public:
-
- PScorePP() { this->num_feats = 1; }
-
- int
- init(int const i)
- {
- this->index = i;
- return i + this->num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- (*dest)[this->index] = 1;
- }
-
- };
+ inverse = false;
+ len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ start1 = start2 = NULL;
+ p1 = p2 = 0;
+ }
template<typename TKN>
class Bitext
@@ -471,18 +454,24 @@ namespace Moses {
bool const flip) const;
#if 1
- typedef boost::unordered_map<uint64_t,sptr<pstats> > pcache_t;
+ typedef boost::unordered_map<typename ::uint64_t,sptr<pstats> > pcache_t;
#else
- typedef map<uint64_t,sptr<pstats> > pcache_t;
+ typedef map<typename ::uint64_t,sptr<pstats> > pcache_t;
#endif
mutable pcache_t cache1,cache2;
protected:
+ typedef typename
+ lru_cache::LRU_Cache<typename ::uint64_t, vector<PhrasePair<Token> > >
+ pplist_cache_t;
+
size_t default_sample_size;
size_t num_workers;
size_t m_pstats_cache_threshold;
+ mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
private:
sptr<pstats>
- prep2(iter const& phrase, size_t const max_sample) const;
+ prep2(iter const& phrase, size_t const max_sample,
+ vector<float> const* const bias) const;
public:
Bitext(size_t const max_sample =1000,
size_t const xnum_workers =16);
@@ -500,14 +489,24 @@ namespace Moses {
virtual void open(string const base, string const L1, string const L2) = 0;
// sptr<pstats> lookup(Phrase const& phrase, size_t factor) const;
- sptr<pstats> lookup(iter const& phrase) const;
- sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
- void prep(iter const& phrase) const;
+ sptr<pstats> lookup(iter const& phrase,vector<float> const* const bias=NULL) const;
+ sptr<pstats> lookup(iter const& phrase, size_t const max_sample,
+ vector<float> const* const bias) const;
+
+ void
+ lookup(vector<Token> const& snt, TSA<Token>& idx,
+ vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
+ vector<vector<typename ::uint64_t> >* pidmap = NULL,
+ typename PhrasePair<Token>::Scorer* scorer=NULL,
+ vector<float> const* const bias=NULL,
+ bool multithread=true) const;
+
+ void prep(iter const& phrase, vector<float> const* const bias) const;
void setDefaultSampleSize(size_t const max_samples);
size_t getDefaultSampleSize() const;
- string toString(uint64_t pid, int isL2) const;
+ string toString(::uint64_t pid, int isL2) const;
virtual size_t revision() const { return 0; }
};
@@ -515,7 +514,7 @@ namespace Moses {
template<typename Token>
string
Bitext<Token>::
- toString(uint64_t pid, int isL2) const
+ toString(::uint64_t pid, int isL2) const
{
ostringstream buf;
uint32_t sid,off,len; parse_pid(pid,sid,off,len);
@@ -587,11 +586,14 @@ namespace Moses {
boost::mutex lock;
class job
{
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
static ThreadSafeCounter active;
+#endif
boost::mutex lock;
friend class agenda;
- boost::taus88 rnd; // every job has its own pseudo random generator
- double rnddenom; // denominator for scaling random sampling
+ boost::taus88 rnd; // every job has its own pseudo random generator
+ double rnddenom; // denominator for scaling random sampling
+ size_t min_diverse; // minimum number of distinct translations
public:
size_t workers; // how many workers are working on this job?
sptr<TSA<Token> const> root; // root of the underlying suffix array
@@ -604,10 +606,13 @@ namespace Moses {
size_t len; // phrase length
bool fwd; // if true, source phrase is L1
sptr<pstats> stats; // stores statistics collected during sampling
- bool step(uint64_t & sid, uint64_t & offset); // select another occurrence
+ vector<float> const* bias; // sentence-level bias for sampling
+ float bias_total;
+ bool step(::uint64_t & sid, ::uint64_t & offset); // select another occurrence
bool done() const;
job(typename TSA<Token>::tree_iterator const& m,
- sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd);
+ sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
+ vector<float> const* const bias);
~job();
};
public:
@@ -632,7 +637,9 @@ namespace Moses {
sptr<pstats>
add_job(typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples);
+ size_t const max_samples,
+ vector<float> const* const bias);
+
sptr<job> get_job();
};
@@ -641,37 +648,60 @@ namespace Moses {
Bitext<Token>::
agenda::
job::
- step(uint64_t & sid, uint64_t & offset)
+ step(::uint64_t & sid, ::uint64_t & offset)
{
boost::lock_guard<boost::mutex> jguard(lock);
- if ((max_samples == 0) && (next < stop))
+ bool ret = (max_samples == 0) && (next < stop);
+ if (ret)
{
next = root->readSid(next,stop,sid);
next = root->readOffset(next,stop,offset);
boost::lock_guard<boost::mutex> sguard(stats->lock);
if (stats->raw_cnt == ctr) ++stats->raw_cnt;
+ if (bias && bias->at(sid) == 0)
+ return false;
stats->sample_cnt++;
- return true;
}
else
{
- while (next < stop && stats->good < max_samples)
+ while (next < stop && (stats->good < max_samples ||
+ stats->trg.size() < min_diverse))
{
next = root->readSid(next,stop,sid);
next = root->readOffset(next,stop,offset);
- {
- boost::lock_guard<boost::mutex> sguard(stats->lock);
+ { // brackets required for lock scoping; see sguard immediately below
+ boost::lock_guard<boost::mutex> sguard(stats->lock);
if (stats->raw_cnt == ctr) ++stats->raw_cnt;
- size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.));
- if (rnum < max_samples - stats->good)
+ size_t scalefac = (stats->raw_cnt - ctr++);
+ size_t rnum = scalefac * (rnd()/(rnd.max()+1.));
+ size_t th = (bias_total
+ ? bias->at(sid)/bias_total * bias->size() * max_samples
+ : max_samples);
+#if 0
+ cerr << rnum << "/" << scalefac << " vs. "
+ << max_samples - stats->good << " ("
+ << max_samples << " - " << stats->good << ")"
+ << " th=" << th;
+ if (bias)
+ cerr << " with bias " << bias->at(sid)
+ << " => " << bias->at(sid) * bias->size();
+ else cerr << " without bias";
+ cerr << endl;
+#endif
+ if (rnum + stats->good < th)
{
stats->sample_cnt++;
- return true;
+ ret = true;
+ break;
}
}
}
- return false;
}
+
+ // boost::lock_guard<boost::mutex> sguard(stats->lock);
+ // abuse of lock for clean output to cerr
+ // cerr << stats->sample_cnt++;
+ return ret;
}
template<typename Token>
@@ -713,8 +743,15 @@ namespace Moses {
worker::
operator()()
{
+ // things to do:
+ // - have each worker maintain their own pstats object and merge results at the end;
+ // - ensure the minimum size of samples considered by a non-locked counter that is only
+ // ever incremented -- who cares if we look at more samples than required, as long
+ // as we look at at least the minimum required
+ // This way, we can reduce the number of lock / unlock operations we need to do during
+ // sampling.
size_t s1=0, s2=0, e1=0, e2=0;
- uint64_t sid=0, offset=0; // of the source phrase
+ ::uint64_t sid=0, offset=0; // of the source phrase
while(sptr<job> j = ag.get_job())
{
j->stats->register_worker();
@@ -733,7 +770,7 @@ namespace Moses {
}
else if (!ag.bt.find_trg_phr_bounds
(sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
- NULL,NULL,true))
+ &aln,NULL,true)) // NULL,NULL,true))
continue;
j->stats->lock.lock();
j->stats->good += 1;
@@ -741,10 +778,21 @@ namespace Moses {
++j->stats->ofwd[po_fwd];
++j->stats->obwd[po_bwd];
j->stats->lock.unlock();
- for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
+ // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
+ for (size_t k = 1; k < aln.size(); k += 2)
aln[k] += s2 - s1;
Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
float sample_weight = 1./((s2-s1+1)*(e2-e1+1));
+
+ vector<typename ::uint64_t> seen;
+ seen.reserve(100);
+ // It is possible that the phrase extraction extracts the same
+ // phrase twice, e.g., when word a co-occurs with sequence b b b
+ // but is aligned only to the middle word. We can only count
+ // each phrase pair once per source phrase occurrence, or else
+ // run the risk of having more joint counts than marginal
+ // counts.
+
for (size_t s = s1; s <= s2; ++s)
{
sptr<iter> b = (j->fwd ? ag.bt.I2 : ag.bt.I1)->find(o+s,e1-s);
@@ -753,7 +801,26 @@ namespace Moses {
// assert(b);
for (size_t i = e1; i <= e2; ++i)
{
- if (! j->stats->add(b->getPid(),sample_weight,aln,
+ ::uint64_t tpid = b->getPid();
+ size_t s = 0;
+ while (s < seen.size() && seen[s] != tpid) ++s;
+ if (s < seen.size())
+ {
+#if 0
+ size_t sid, off, len;
+ parse_pid(tpid,sid,off,len);
+ cerr << "HA, gotcha! " << sid << ":" << off << " at " << HERE << endl;
+ for (size_t z = 0; z < len; ++z)
+ {
+ id_type tid = ag.bt.T2->sntStart(sid)[off+z].id();
+ cerr << (*ag.bt.V2)[tid] << " ";
+ }
+ cerr << endl;
+#endif
+ continue;
+ }
+ seen.push_back(tpid);
+ if (! j->stats->add(tpid,sample_weight,aln,
b->approxOccurrenceCount(),
po_fwd,po_bwd))
{
@@ -784,8 +851,10 @@ namespace Moses {
#endif
}
}
- if (j->fwd && s < s2)
- for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
+ // if (j->fwd && s < s2)
+ // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
+ if (s < s2)
+ for (size_t k = 1; k < aln.size(); k += 2)
--aln[k];
}
// j->stats->lock.unlock();
@@ -801,7 +870,10 @@ namespace Moses {
~job()
{
if (stats) stats.reset();
- --active;
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ try { --active; } catch (...) {}
+#endif
+ // counter may not exist any more at destruction time
}
template<typename Token>
@@ -809,9 +881,11 @@ namespace Moses {
agenda::
job::
job(typename TSA<Token>::tree_iterator const& m,
- sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
+ sptr<TSA<Token> > const& r, size_t maxsmpl,
+ bool isfwd, vector<float> const* const sntbias)
: rnd(0)
, rnddenom(rnd.max() + 1.)
+ , min_diverse(10)
, workers(0)
, root(r)
, next(m.lower_bound(-1))
@@ -820,12 +894,26 @@ namespace Moses {
, ctr(0)
, len(m.size())
, fwd(isfwd)
+ , bias(sntbias)
{
stats.reset(new pstats());
stats->raw_cnt = m.approxOccurrenceCount();
+ bias_total = 0; // needed for renormalization
+ if (bias)
+ {
+ for (char const* x = m.lower_bound(-1); x < stop;)
+ {
+ uint32_t sid; ushort offset;
+ next = root->readSid(next,stop,sid);
+ next = root->readOffset(next,stop,offset);
+ bias_total += bias->at(sid);
+ }
+ }
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
// if (++active%5 == 0)
++active;
// cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
+#endif
}
template<typename Token>
@@ -833,12 +921,12 @@ namespace Moses {
Bitext<Token>::
agenda::
add_job(typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples)
+ size_t const max_samples, vector<float> const* const bias)
{
boost::unique_lock<boost::mutex> lk(this->lock);
static boost::posix_time::time_duration nodelay(0,0,0,0);
bool fwd = phrase.root == bt.I1.get();
- sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd));
+ sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd, bias));
j->stats->register_worker();
joblist.push_back(j);
@@ -1197,9 +1285,18 @@ namespace Moses {
assert(T2);
assert(Tx);
- bitvector forbidden((flip ? T1 : T2)->sntLen(sid));
- size_t slen1 = (*T1).sntLen(sid);
- size_t slen2 = (*T2).sntLen(sid);
+ size_t slen1,slen2;
+ if (flip)
+ {
+ slen1 = T2->sntLen(sid);
+ slen2 = T1->sntLen(sid);
+ }
+ else
+ {
+ slen1 = T1->sntLen(sid);
+ slen2 = T2->sntLen(sid);
+ }
+ bitvector forbidden(slen2);
if (full_alignment)
{
if (slen1*slen2 > full_alignment->size())
@@ -1218,17 +1315,11 @@ namespace Moses {
if (flip) { p = binread(p,trg); assert(p<x); p = binread(p,src); }
else { p = binread(p,src); assert(p<x); p = binread(p,trg); }
- // cerr << sid << " " << src << "/" << slen1 << " " << trg << "/"
- // << slen2 << endl;
- if (src >= slen1 || trg >= slen2)
- {
- ostringstream buf;
- buf << "Alignment range error at sentence " << sid << "!" << endl
- << src << "/" << slen1 << " " << trg << "/" << slen2 << endl;
- cerr << buf.str() << endl;
- UTIL_THROW(util::Exception, buf.str().c_str());
- }
-
+ UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
+ "Alignment range error at sentence " << sid << "!\n"
+ << src << "/" << slen1 << " " <<
+ trg << "/" << slen2);
+
if (src < start || src >= stop)
forbidden.set(trg);
else
@@ -1238,22 +1329,11 @@ namespace Moses {
}
if (core_alignment)
{
- if (flip)
- {
- aln1[trg].push_back(src);
- aln2[src].push_back(trg);
- }
- else
- {
- aln1[src].push_back(trg);
- aln2[trg].push_back(src);
- }
+ aln1[src].push_back(trg);
+ aln2[trg].push_back(src);
}
if (full_alignment)
- {
- if (flip) full_alignment->set(trg*slen2 + src);
- else full_alignment->set(src*slen2 + trg);
- }
+ full_alignment->set(src*slen2 + trg);
}
for (size_t i = lft; i <= rgt; ++i)
@@ -1267,67 +1347,17 @@ namespace Moses {
if (core_alignment)
{
core_alignment->clear();
- if (flip)
- {
- for (size_t i = lft; i <= rgt; ++i)
- {
- sort(aln1[i].begin(),aln1[i].end());
- BOOST_FOREACH(ushort x, aln1[i])
- {
- core_alignment->push_back(i-lft);
- core_alignment->push_back(x-start);
- }
- }
- }
- else
+ for (size_t i = start; i < stop; ++i)
{
- for (size_t i = start; i < stop; ++i)
+ BOOST_FOREACH(ushort x, aln1[i])
{
- BOOST_FOREACH(ushort x, aln1[i])
- {
- core_alignment->push_back(i-start);
- core_alignment->push_back(x-lft);
- }
+ core_alignment->push_back(i-start);
+ core_alignment->push_back(x-lft);
}
}
-
// now determine fwd and bwd phrase orientation
- if (flip)
- {
- po_fwd = find_po_fwd(aln2,aln1,start,stop,s1,e2);
- po_bwd = find_po_bwd(aln2,aln1,start,stop,s1,e2);
- }
- else
- {
- po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
- po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
- }
-#if 0
- // if (e1 - s1 > 3)
- {
- lock_guard<mutex> guard(this->lock);
- Token const* t1 = T1->sntStart(sid);
- Token const* t2 = T2->sntStart(sid);
- cout << "[" << start << ":" << stop << "] => ["
- << s1 << ":" << s2 << ":"
- << e1 << ":" << e2 << "]" << endl;
- for (size_t k = start; k < stop; ++k)
- cout << k-start << "." << (*V1)[t1[k].id()] << " ";
- cout << endl;
- for (size_t k = s1; k < e2;)
- {
- if (k == s2) cout << "[";
- cout << int(k)-int(s2) << "." << (*V2)[t2[k].id()];
- if (++k == e1) cout << "] ";
- else cout << " ";
- }
- cout << endl;
- for (size_t k = 0; k < core_alignment->size(); k += 2)
- cout << int((*core_alignment)[k]) << "-" << int((*core_alignment)[k+1]) << " ";
- cout << "\n" << __FILE__ << ":" << __LINE__ << endl;
-
- }
-#endif
+ po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
+ po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
}
return lft <= rgt;
}
@@ -1335,15 +1365,16 @@ namespace Moses {
template<typename Token>
void
Bitext<Token>::
- prep(iter const& phrase) const
+ prep(iter const& phrase, vector<float> const* const bias) const
{
- prep2(phrase, this->default_sample_size);
+ prep2(phrase, this->default_sample_size,bias);
}
template<typename Token>
sptr<pstats>
Bitext<Token>::
- prep2(iter const& phrase, size_t const max_sample) const
+ prep2(iter const& phrase, size_t const max_sample,
+ vector<float> const* const bias) const
{
boost::lock_guard<boost::mutex> guard(this->lock);
if (!ag)
@@ -1356,13 +1387,14 @@ namespace Moses {
#if 1
// use pcache only for plain sentence input
if (StaticData::Instance().GetInputType() == SentenceInput &&
- max_sample == this->default_sample_size &&
+ max_sample == this->default_sample_size && bias == NULL &&
phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
{
- // need to test what a good caching threshold is
+ // still need to test what a good caching threshold is
// is caching here the cause of the apparent memory leak in
- // confusion network decoding ????
- uint64_t pid = phrase.getPid();
+ // confusion network decoding ???? No, it isn't.
+ // That was because of naive, brute-force input path generation.
+ ::uint64_t pid = phrase.getPid();
pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2);
pcache_t::value_type entry(pid,sptr<pstats>());
pair<pcache_t::iterator,bool> foo;
@@ -1372,7 +1404,7 @@ namespace Moses {
// cerr << "NEW FREQUENT PHRASE: "
// << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount()
// << " at " << __FILE__ << ":" << __LINE__ << endl;
- foo.first->second = ag->add_job(phrase, max_sample);
+ foo.first->second = ag->add_job(phrase, max_sample,NULL);
assert(foo.first->second);
}
assert(foo.first->second);
@@ -1381,17 +1413,135 @@ namespace Moses {
}
else
#endif
- ret = ag->add_job(phrase, max_sample);
+ ret = ag->add_job(phrase, max_sample,bias);
assert(ret);
return ret;
}
+ // worker for scoring and sorting phrase table entries in parallel
+ template<typename Token>
+ class pstats2pplist
+ {
+ Ttrack<Token> const& m_other;
+ sptr<pstats> m_pstats;
+ vector<PhrasePair<Token> >& m_pplist;
+ typename PhrasePair<Token>::Scorer const* m_scorer;
+ PhrasePair<Token> m_pp;
+ Token const* m_token;
+ size_t m_len;
+ ::uint64_t m_pid1;
+ bool m_is_inverse;
+ public:
+
+ // CONSTRUCTOR
+ pstats2pplist(typename TSA<Token>::tree_iterator const& m,
+ Ttrack<Token> const& other,
+ sptr<pstats> const& ps,
+ vector<PhrasePair<Token> >& dest,
+ typename PhrasePair<Token>::Scorer const* scorer)
+ : m_other(other)
+ , m_pstats(ps)
+ , m_pplist(dest)
+ , m_scorer(scorer)
+ , m_token(m.getToken(0))
+ , m_len(m.size())
+ , m_pid1(m.getPid())
+ , m_is_inverse(false)
+ { }
+
+ // WORKER
+ void
+ operator()()
+ {
+ // wait till all statistics have been collected
+ boost::unique_lock<boost::mutex> lock(m_pstats->lock);
+ while (m_pstats->in_progress)
+ m_pstats->ready.wait(lock);
+
+ m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);
+
+ // convert pstats entries to phrase pairs
+ pstats::trg_map_t::iterator a;
+ for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a)
+ {
+ uint32_t sid,off,len;
+ parse_pid(a->first, sid, off, len);
+ m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
+ m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),m_pp.joint);
+ size_t J = m_pp.joint<<7; // hard coded threshold of 1/128
+ if (m_pp.good1 > J || m_pp.good2 > J) continue;
+ if (m_scorer)
+ {
+ (*m_scorer)(m_pp);
+ }
+ m_pplist.push_back(m_pp);
+ }
+ greater<PhrasePair<Token> > sorter;
+ if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter);
+ }
+ };
+
+ template<typename Token>
+ void
+ Bitext<Token>::
+ lookup(vector<Token> const& snt, TSA<Token>& idx,
+ vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
+ vector<vector<typename ::uint64_t> >* pidmap,
+ typename PhrasePair<Token>::Scorer* scorer,
+ vector<float> const* const bias, bool multithread) const
+ {
+ // typedef vector<vector<sptr<vector<PhrasePair<Token> > > > > ret_t;
+
+ dest.clear();
+ dest.resize(snt.size());
+ if (pidmap) { pidmap->clear(); pidmap->resize(snt.size()); }
+
+ // collect statistics in parallel, then build PT entries as
+ // the sampling finishes
+ bool fwd = &idx == I1.get();
+ vector<boost::thread*> workers; // background threads doing the lookup
+ pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2);
+ if (C.capacity() < 100000) C.reserve(100000);
+ for (size_t i = 0; i < snt.size(); ++i)
+ {
+ dest[i].reserve(snt.size()-i);
+ typename TSA<Token>::tree_iterator m(&idx);
+ for (size_t k = i; k < snt.size() && m.extend(snt[k].id()); ++k)
+ {
+ ::uint64_t key = m.getPid();
+ if (pidmap) (*pidmap)[i].push_back(key);
+ sptr<vector<PhrasePair<Token> > > pp = C.get(key);
+ if (pp)
+ dest[i].push_back(pp);
+ else
+ {
+ pp.reset(new vector<PhrasePair<Token> >());
+ C.set(key,pp);
+ dest[i].push_back(pp);
+ sptr<pstats> x = prep2(m, this->default_sample_size,bias);
+ pstats2pplist<Token> w(m,*(fwd?T2:T1),x,*pp,scorer);
+ if (multithread)
+ {
+ boost::thread* t = new boost::thread(w);
+ workers.push_back(t);
+ }
+ else w();
+ }
+ }
+ }
+ for (size_t w = 0; w < workers.size(); ++w)
+ {
+ workers[w]->join();
+ delete workers[w];
+ }
+ }
+
template<typename Token>
sptr<pstats>
Bitext<Token>::
- lookup(iter const& phrase) const
+ lookup(iter const& phrase, vector<float> const* const bias) const
{
- sptr<pstats> ret = prep2(phrase, this->default_sample_size);
+ sptr<pstats> ret = prep2(phrase, this->default_sample_size, bias);
assert(ret);
boost::lock_guard<boost::mutex> guard(this->lock);
if (this->num_workers <= 1)
@@ -1408,7 +1558,8 @@ namespace Moses {
template<typename Token>
sptr<pstats>
Bitext<Token>::
- lookup(iter const& phrase, size_t const max_sample) const
+ lookup(iter const& phrase, size_t const max_sample,
+ vector<float> const* const bias) const
{
sptr<pstats> ret = prep2(phrase, max_sample);
boost::lock_guard<boost::mutex> guard(this->lock);
@@ -1452,12 +1603,44 @@ namespace Moses {
return (max_samples && stats->good >= max_samples) || next == stop;
}
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
template<typename TKN>
ThreadSafeCounter
Bitext<TKN>::
agenda::
job::active;
+#endif
+ template<typename Token>
+ void
+ expand(typename Bitext<Token>::iter const& m,
+ Bitext<Token> const& bt,
+ pstats const& ps, vector<PhrasePair<Token> >& dest)
+ {
+ bool fwd = m.root == bt.I1.get();
+ dest.reserve(ps.trg.size());
+ PhrasePair<Token> pp;
+ pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0);
+ // cout << HERE << " " << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << endl;
+ pstats::trg_map_t::const_iterator a;
+ for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
+ {
+ uint32_t sid,off,len;
+ parse_pid(a->first, sid, off, len);
+ pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off,
+ len, a->second);
+ dest.push_back(pp);
+ }
+#if 0
+ typename PhrasePair<Token>::SortByTargetIdSeq sorter;
+ sort(dest.begin(), dest.end(),sorter);
+ BOOST_FOREACH(PhrasePair<Token> const& p, dest)
+ cout << toString (*(fwd ? bt.V1 : bt.V2),p.start1,p.len1) << " ::: "
+ << toString (*(fwd ? bt.V2 : bt.V1),p.start2,p.len2) << " "
+ << p.joint << endl;
+#endif
+ }
+
} // end of namespace bitext
} // end of namespace moses
#endif
diff --git a/moses/TranslationModel/UG/mm/ug_im_tsa.h b/moses/TranslationModel/UG/mm/ug_im_tsa.h
index 1de45d877..f7256ba2d 100644
--- a/moses/TranslationModel/UG/mm/ug_im_tsa.h
+++ b/moses/TranslationModel/UG/mm/ug_im_tsa.h
@@ -52,12 +52,12 @@ namespace ugdiss
public:
imTSA();
- imTSA(shared_ptr<Ttrack<TOKEN> const> c,
+ imTSA(boost::shared_ptr<Ttrack<TOKEN> const> c,
bdBitset const* filt,
ostream* log = NULL);
imTSA(imTSA<TOKEN> const& prior,
- shared_ptr<imTtrack<TOKEN> const> const& crp,
+ boost::shared_ptr<imTtrack<TOKEN> const> const& crp,
vector<id_type> const& newsids, size_t const vsize);
count_type
@@ -74,13 +74,13 @@ namespace ugdiss
readSid(char const* p, char const* q, id_type& sid) const;
char const*
- readSid(char const* p, char const* q, uint64_t& sid) const;
+ readSid(char const* p, char const* q, ::uint64_t& sid) const;
char const*
readOffset(char const* p, char const* q, uint16_t& offset) const;
char const*
- readOffset(char const* p, char const* q, uint64_t& offset) const;
+ readOffset(char const* p, char const* q, ::uint64_t& offset) const;
void
sanityCheck() const;
@@ -140,7 +140,7 @@ namespace ugdiss
// specified in filter
template<typename TOKEN>
imTSA<TOKEN>::
- imTSA(shared_ptr<Ttrack<TOKEN> const> c, bdBitset const* filter, ostream* log)
+ imTSA(boost::shared_ptr<Ttrack<TOKEN> const> c, bdBitset const* filter, ostream* log)
{
assert(c);
this->corpus = c;
@@ -267,7 +267,7 @@ namespace ugdiss
template<typename TOKEN>
char const*
imTSA<TOKEN>::
- readSid(char const* p, char const* q, uint64_t& sid) const
+ readSid(char const* p, char const* q, ::uint64_t& sid) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
@@ -289,7 +289,7 @@ namespace ugdiss
template<typename TOKEN>
char const*
imTSA<TOKEN>::
- readOffset(char const* p, char const* q, uint64_t& offset) const
+ readOffset(char const* p, char const* q, ::uint64_t& offset) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
@@ -359,7 +359,7 @@ namespace ugdiss
template<typename TOKEN>
imTSA<TOKEN>::
imTSA(imTSA<TOKEN> const& prior,
- shared_ptr<imTtrack<TOKEN> const> const& crp,
+ boost::shared_ptr<imTtrack<TOKEN> const> const& crp,
vector<id_type> const& newsids, size_t const vsize)
{
typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(crp.get());
diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
index 05066c922..ac49ebcd4 100644
--- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
@@ -16,6 +16,9 @@
#include "tpt_tokenindex.h"
#include "ug_ttrack_base.h"
#include "tpt_tokenindex.h"
+#include "util/exception.hh"
+#include "moses/Util.h"
+
// #include "ug_vocab.h"
// define the corpus buffer size (in sentences) and the
@@ -49,10 +52,12 @@ namespace ugdiss
typename boost::shared_ptr<imTtrack<Token> >
append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
+ void m_check_token_count(); // debugging function
+
public:
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d);
- imTtrack(istream& in, TokenIndex const& V, ostream* log);
+ imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL);
imTtrack(size_t reserve = 0);
// imTtrack(istream& in, Vocab& V);
@@ -70,6 +75,22 @@ namespace ugdiss
};
template<typename Token>
+ void
+ imTtrack<Token>::
+ m_check_token_count()
+ { // sanity check
+ size_t check = 0;
+ BOOST_FOREACH(vector<Token> const& s, *myData)
+ check += s.size();
+ UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]"
+ << " Wrong token count after appending sentence!"
+ << " Counted " << check << " but expected "
+ << this->numToks << " in a total of " << myData->size()
+ << " sentences.");
+
+ }
+
+ template<typename Token>
Token const*
imTtrack<Token>::
sntStart(size_t sid) const // return pointer to beginning of sentence
@@ -110,10 +131,10 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
- imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL)
+ imTtrack(istream& in, TokenIndex const& V, ostream* log)
+ : numToks(0)
{
myData.reset(new vector<vector<Token> >());
- numToks = 0;
string line,w;
size_t linectr=0;
boost::unordered_map<string,id_type> H;
@@ -135,6 +156,7 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(size_t reserve)
+ : numToks(0)
{
myData.reset(new vector<vector<Token> >());
if (reserve) myData->reserve(reserve);
@@ -143,9 +165,9 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
+ : numToks(0)
{
myData = d;
- numToks = 0;
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
@@ -168,10 +190,13 @@ namespace ugdiss
/// add a sentence to the database
template<typename TOKEN>
- shared_ptr<imTtrack<TOKEN> >
- append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
+ boost::shared_ptr<imTtrack<TOKEN> >
+ append(boost::shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
{
- shared_ptr<imTtrack<TOKEN> > ret;
+#if 1
+ if (crp) crp->m_check_token_count();
+#endif
+ boost::shared_ptr<imTtrack<TOKEN> > ret;
if (crp == NULL)
{
ret.reset(new imTtrack<TOKEN>());
@@ -185,6 +210,11 @@ namespace ugdiss
}
else ret = crp;
ret->myData->push_back(snt);
+ ret->numToks += snt.size();
+
+#if 1
+ ret->m_check_token_count();
+#endif
return ret;
}
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
index 2d64705f7..b7e359223 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
+++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
@@ -2,6 +2,8 @@
// lexical phrase scorer, version 1
// written by Ulrich Germann
+// Is the +1 in computing the lexical probabilities taken from the original phrase-scoring code?
+
#ifndef __ug_lexical_phrase_scorer_h
#define __ug_lexical_phrase_scorer_h
@@ -11,6 +13,7 @@
#include <boost/unordered_map.hpp>
#include "tpt_pickler.h"
#include "ug_mm_2d_table.h"
+#include "util/exception.hh"
using namespace std;
namespace ugdiss
{
@@ -19,25 +22,27 @@ namespace ugdiss
class
LexicalPhraseScorer2
{
+ vector<string> ftag;
public:
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
table_t COOC;
void open(string const& fname);
-
template<typename someint>
void
score(TKN const* snt1, size_t const s1, size_t const e1,
TKN const* snt2, size_t const s2, size_t const e2,
- vector<someint> & aln, float & fwd_score, float& bwd_score) const;
+ vector<someint> const & aln, float const alpha,
+ float & fwd_score, float& bwd_score) const;
void
score(TKN const* snt1, size_t const s1, size_t const e1,
TKN const* snt2, size_t const s2, size_t const e2,
char const* const aln_start, char const* const aln_end,
- float & fwd_score, float& bwd_score) const;
+ float const alpha, float & fwd_score, float& bwd_score) const;
+
// plup: permissive lookup
- float plup_fwd(id_type const s,id_type const t) const;
- float plup_bwd(id_type const s,id_type const t) const;
+ float plup_fwd(id_type const s,id_type const t, float const alpha) const;
+ float plup_bwd(id_type const s,id_type const t, float const alpha) const;
// to be done:
// - on-the-fly smoothing ?
// - better (than permissive-lookup) treatment of unknown combinations
@@ -59,7 +64,8 @@ namespace ugdiss
LexicalPhraseScorer2<TKN>::
score(TKN const* snt1, size_t const s1, size_t const e1,
TKN const* snt2, size_t const s2, size_t const e2,
- vector<someint> & aln, float & fwd_score, float& bwd_score) const
+ vector<someint> const & aln, float const alpha,
+ float & fwd_score, float& bwd_score) const
{
vector<float> p1(e1,0), p2(e2,0);
vector<int> c1(e1,0), c2(e2,0);
@@ -68,9 +74,9 @@ namespace ugdiss
{
i1 = aln[k]; i2 = aln[++k];
if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
- p1[i1] += plup_fwd(snt1[i1].id(),snt2[i2].id());
+ p1[i1] += plup_fwd(snt1[i1].id(),snt2[i2].id(),alpha);
++c1[i1];
- p2[i2] += plup_bwd(snt1[i1].id(),snt2[i2].id());
+ p2[i2] += plup_bwd(snt1[i1].id(),snt2[i2].id(),alpha);
++c2[i2];
}
fwd_score = 0;
@@ -78,45 +84,62 @@ namespace ugdiss
{
if (c1[i] == 1) fwd_score += log(p1[i]);
else if (c1[i]) fwd_score += log(p1[i])-log(c1[i]);
- else fwd_score += log(plup_fwd(snt1[i].id(),0));
+ else fwd_score += log(plup_fwd(snt1[i].id(),0,alpha));
}
bwd_score = 0;
for (size_t i = s2; i < e2; ++i)
{
if (c2[i] == 1) bwd_score += log(p2[i]);
else if (c2[i]) bwd_score += log(p2[i])-log(c2[i]);
- else bwd_score += log(plup_bwd(0,snt2[i].id()));
+ else bwd_score += log(plup_bwd(0,snt2[i].id(),alpha));
}
}
template<typename TKN>
float
LexicalPhraseScorer2<TKN>::
- plup_fwd(id_type const s, id_type const t) const
+ plup_fwd(id_type const s, id_type const t, float const alpha) const
{
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
- // if (!COOC[s][t]) cout << s << " " << t << endl;
- // assert(COOC[s][t]);
- return float(COOC[s][t]+1)/(COOC.m1(s)+1);
- }
+ UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
+ << ": alpha parameter must be >= 0");
+ float ret = COOC[s][t]+alpha;
+ ret = (ret?ret:1.)/(COOC.m1(s)+alpha);
+ UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
+ << ": result not > 0 and <= 1. alpha = " << alpha << "; "
+ << COOC[s][t] << "/" << COOC.m1(s));
+#if 0
+ cerr << "[" << s << "," << t << "] "
+ << COOC.m1(s) << "/"
+ << COOC[s][t] << "/"
+ << COOC.m2(t) << endl;
+#endif
+ return ret;
+ }
+
template<typename TKN>
float
LexicalPhraseScorer2<TKN>::
- plup_bwd(id_type const s, id_type const t) const
+ plup_bwd(id_type const s, id_type const t,float const alpha) const
{
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
- // assert(COOC[s][t]);
- return float(COOC[s][t]+1)/(COOC.m2(t)+1);
+ UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
+ << ": alpha parameter must be >= 0");
+ float ret = float(COOC[s][t]+alpha);
+ ret = (ret?ret:1.)/(COOC.m2(t)+alpha);
+ UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
+ << ": result not > 0 and <= 1.");
+ return ret;
}
-
+
template<typename TKN>
void
LexicalPhraseScorer2<TKN>::
score(TKN const* snt1, size_t const s1, size_t const e1,
TKN const* snt2, size_t const s2, size_t const e2,
char const* const aln_start, char const* const aln_end,
- float & fwd_score, float& bwd_score) const
+ float const alpha, float & fwd_score, float& bwd_score) const
{
vector<float> p1(e1,0), p2(e2,0);
vector<int> c1(e1,0), c2(e2,0);
@@ -125,9 +148,9 @@ namespace ugdiss
{
x = binread(binread(x,i1),i2);
if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
- p1[i1] += plup_fwd(snt1[i1].id(), snt2[i2].id());
+ p1[i1] += plup_fwd(snt1[i1].id(), snt2[i2].id(),alpha);
++c1[i1];
- p2[i2] += plup_bwd(snt1[i1].id(), snt2[i2].id());
+ p2[i2] += plup_bwd(snt1[i1].id(), snt2[i2].id(),alpha);
++c2[i2];
}
fwd_score = 0;
@@ -135,14 +158,14 @@ namespace ugdiss
{
if (c1[i] == 1) fwd_score += log(p1[i]);
else if (c1[i]) fwd_score += log(p1[i])-log(c1[i]);
- else fwd_score += log(plup_fwd(snt1[i].id(),0));
+ else fwd_score += log(plup_fwd(snt1[i].id(),0,alpha));
}
bwd_score = 0;
for (size_t i = s2; i < e2; ++i)
{
if (c2[i] == 1) bwd_score += log(p2[i]);
else if (c2[i]) bwd_score += log(p2[i])-log(c2[i]);
- else bwd_score += log(plup_bwd(0,snt2[i].id()));
+ else bwd_score += log(plup_bwd(0,snt2[i].id(),alpha));
}
}
}
diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h
new file mode 100644
index 000000000..d1c9a9767
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h
@@ -0,0 +1,106 @@
+//-*- c++ -*-
+#pragma once
+#include <vector>
+#include <map>
+#include <algorithm>
+#include <boost/shared_ptr.hpp>
+#include <boost/thread.hpp>
+#include <sys/time.h>
+
+
+#ifndef sptr
+#define sptr boost::shared_ptr
+#endif
+
+namespace lru_cache
+{
+ using namespace std;
+ using namespace boost;
+
+ template<typename KEY, typename VAL>
+ class LRU_Cache
+ {
+ public:
+ typedef unordered_map<KEY,uint32_t> map_t;
+ private:
+ struct Record
+ {
+ uint32_t prev,next;
+ KEY key;
+ // timeval tstamp; // time stamp
+ typename boost::shared_ptr<VAL> ptr; // cached shared ptr
+ };
+
+ mutable boost::shared_mutex m_lock;
+ uint32_t m_qfront, m_qback;
+ vector<Record> m_recs;
+ map_t m_idx;
+
+ void
+ update_queue(KEY const& key, uint32_t const p)
+ {
+ // CALLER MUST LOCK!
+ // "remove" item in slot p from it's current position of the
+ // queue (which is different from the slot position) and move it
+ // to the end
+ Record& r = m_recs[p];
+ if (m_recs.size() == 1)
+ r.next = r.prev = m_qback = m_qfront = 0;
+
+ if (r.key != key || p == m_qback) return;
+
+ if (m_qfront == p)
+ m_qfront = m_recs[r.next].prev = r.next;
+ else
+ {
+ m_recs[r.prev].next = r.next;
+ m_recs[r.next].prev = r.prev;
+ }
+ r.prev = m_qback;
+ m_recs[r.prev].next = m_qback = r.next = p;
+ }
+
+ public:
+ LRU_Cache(size_t capacity=1) : m_qfront(0), m_qback(0) { reserve(capacity); }
+ size_t capacity() const { return m_recs.capacity(); }
+ void reserve(size_t s) { m_recs.reserve(s); }
+
+ sptr<VAL>
+ get(KEY const& key)
+ {
+ uint32_t p;
+ { // brackets needed for lock scoping
+ boost::shared_lock<boost::shared_mutex> rlock(m_lock);
+ typename map_t::const_iterator i = m_idx.find(key);
+ if (i == m_idx.end()) return sptr<VAL>();
+ p = i->second;
+ }
+ boost::lock_guard<boost::shared_mutex> guard(m_lock);
+ update_queue(key,p);
+ return m_recs[p].ptr;
+ }
+
+ void
+ set(KEY const& key, sptr<VAL> const& ptr)
+ {
+ boost::lock_guard<boost::shared_mutex> lock(m_lock);
+ pair<typename map_t::iterator,bool> foo;
+ foo = m_idx.insert(make_pair(key,m_recs.size()));
+
+ uint32_t p = foo.first->second;
+ if (foo.second) // was not in the cache
+ {
+ if (m_recs.size() < m_recs.capacity())
+ m_recs.push_back(Record());
+ else
+ {
+ foo.first->second = p = m_qfront;
+ m_idx.erase(m_recs[p].key);
+ }
+ m_recs[p].key = key;
+ }
+ update_queue(key,p);
+ m_recs[p].ptr = ptr;
+ }
+ };
+}
diff --git a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
index 6f1abed9e..cfc86b8fc 100644
--- a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
+++ b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h
@@ -52,12 +52,13 @@ namespace ugdiss
VAL operator[](ID key) const;
};
- Cell* data;
- VAL *M1, *M2;
- OFFSET * index;
+ Cell const* data;
+ VAL const* M1;
+ VAL const* M2;
+ OFFSET const* index;
ID numRows;
ID numCols;
- boost::shared_ptr<bio::mapped_file> file;
+ boost::shared_ptr<bio::mapped_file_source> file;
VAL m1(ID key) const
{
@@ -120,8 +121,8 @@ namespace ugdiss
string foo = msg.str();
UTIL_THROW(util::Exception,foo.c_str());
}
- file.reset(new bio::mapped_file());
- file->open(fname,ios::in|ios::out);
+ file.reset(new bio::mapped_file_source());
+ file->open(fname);
if (!file->is_open())
{
ostringstream msg;
@@ -130,14 +131,14 @@ namespace ugdiss
string foo = msg.str();
UTIL_THROW(util::Exception,foo.c_str());
}
- char* p = file->data();
- filepos_type offset = *reinterpret_cast<filepos_type*>(p);
- index = reinterpret_cast<OFFSET*>(p+offset); p += sizeof(offset);
+ char const* p = file->data();
+ filepos_type offset = *reinterpret_cast<filepos_type const*>(p);
+ index = reinterpret_cast<OFFSET const*>(p+offset); p += sizeof(offset);
numRows = *reinterpret_cast<ID const*>(p); p += sizeof(id_type);
numCols = *reinterpret_cast<ID const*>(p); p += sizeof(id_type);
- data = reinterpret_cast<Cell*>(p);
+ data = reinterpret_cast<Cell const*>(p);
// cout << numRows << " rows; " << numCols << " columns " << endl;
- M1 = reinterpret_cast<VAL*>(index+numRows+1);
+ M1 = reinterpret_cast<VAL const*>(index+numRows+1);
M2 = M1+numRows;
// cout << "Table " << fname << " has " << numRows << " rows and "
// << numCols << " columns." << endl;
diff --git a/moses/TranslationModel/UG/mm/ug_mm_tsa.h b/moses/TranslationModel/UG/mm/ug_mm_tsa.h
index da6637a0a..9d5038e26 100644
--- a/moses/TranslationModel/UG/mm/ug_mm_tsa.h
+++ b/moses/TranslationModel/UG/mm/ug_mm_tsa.h
@@ -60,13 +60,13 @@ namespace ugdiss
readSid(char const* p, char const* q, id_type& sid) const;
char const*
- readSid(char const* p, char const* q, uint64_t& sid) const;
+ readSid(char const* p, char const* q, ::uint64_t& sid) const;
char const*
readOffset(char const* p, char const* q, uint16_t& offset) const;
char const*
- readOffset(char const* p, char const* q, uint64_t& offset) const;
+ readOffset(char const* p, char const* q, ::uint64_t& offset) const;
void sanityCheck() const;
@@ -188,7 +188,7 @@ namespace ugdiss
template<typename TOKEN>
char const*
mmTSA<TOKEN>::
- readSid(char const* p, char const* q, uint64_t& sid) const
+ readSid(char const* p, char const* q, ::uint64_t& sid) const
{
return tightread(p,q,sid);
}
@@ -210,7 +210,7 @@ namespace ugdiss
inline
char const*
mmTSA<TOKEN>::
- readOffset(char const* p, char const* q, uint64_t& offset) const
+ readOffset(char const* p, char const* q, ::uint64_t& offset) const
{
return tightread(p,q,offset);
}
@@ -243,7 +243,7 @@ namespace ugdiss
{
raw = 0;
id_type sid; uint16_t off;
- boost::dynamic_bitset<uint64_t> check(this->corpus->size());
+ boost::dynamic_bitset<typename ::uint64_t> check(this->corpus->size());
while (p < q)
{
p = tightread(p,q,sid);
diff --git a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
index 2be6e6de5..51ba21778 100644
--- a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
@@ -93,7 +93,7 @@ namespace ugdiss
assert(myfile.is_open());
Moses::prime(myfile);
filepos_type idxOffset;
- char* p = myfile.data();
+ const char* p = myfile.data();
id_type numSent,numWords;
p = numread(p,idxOffset);
p = numread(p,numSent);
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
new file mode 100644
index 000000000..6373f8468
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
@@ -0,0 +1,97 @@
+#include "ug_phrasepair.h"
+namespace Moses {
+ namespace bitext
+ {
+
+#if 0
+ void
+ PhrasePair::
+ init()
+ {
+ p1 = p2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ }
+
+ void
+ PhrasePair::
+ init(uint64_t const pid1,
+ pstats const& ps1,
+ pstats const& ps2,
+ size_t const numfeats)
+ {
+ p1 = pid1;
+ raw1 = ps1.raw_cnt + ps2.raw_cnt;
+ sample1 = ps1.sample_cnt + ps2.sample_cnt;
+ sample2 = 0;
+ good1 = ps1.good + ps2.good;
+ good2 = 0;
+ joint = 0;
+ fvals.resize(numfeats);
+ }
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2, jstats const& js1, jstats const& js2)
+ {
+ p2 = pid2;
+ raw2 = js1.cnt2() + js2.cnt2();
+ joint = js1.rcnt() + js2.rcnt();
+ assert(js1.aln().size() || js2.aln().size());
+ if (js1.aln().size())
+ aln = js1.aln()[0].second;
+ else if (js2.aln().size())
+ aln = js2.aln()[0].second;
+ for (int i = po_first; i < po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
+ dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
+ }
+ return *this;
+ }
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2, size_t r2)
+ {
+ p2 = pid2;
+ raw2 = r2;
+ joint = 0;
+ return *this;
+ }
+
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2,
+ size_t const raw2extra,
+ jstats const& js)
+ {
+ p2 = pid2;
+ raw2 = js.cnt2() + raw2extra;
+ joint = js.rcnt();
+ assert(js.aln().size());
+ if (js.aln().size())
+ aln = js.aln()[0].second;
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
+ dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
+ }
+ return *this;
+ }
+
+ float
+ PhrasePair::
+ eval(vector<float> const& w)
+ {
+ assert(w.size() == this->fvals.size());
+ this->score = 0;
+ for (size_t i = 0; i < w.size(); ++i)
+ this->score += w[i] * this->fvals[i];
+ return this->score;
+ }
+#endif
+ } // namespace bitext
+} // namespace Moses
+
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
new file mode 100644
index 000000000..a966d00dc
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -0,0 +1,13 @@
+//-*- c++ -*-
+#pragma once
+
+// using namespace ugdiss;
+// using namespace std;
+
+// namespace Moses {
+// namespace bitext
+// {
+
+
+// } // namespace bitext
+// } // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_base.h b/moses/TranslationModel/UG/mm/ug_tsa_base.h
index a6291ac3c..83593c79c 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_base.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_base.h
@@ -53,7 +53,7 @@ namespace ugdiss
/* an entry in the array, for iteration over all occurrences of a
* particular sequence */
// typedef boost::dynamic_bitset<uint64_t> bitset;
- typedef shared_ptr<bitvector> bitset_pointer;
+ typedef boost::shared_ptr<bitvector> bitset_pointer;
typedef TKN Token;
typedef BitSetCache<TSA<TKN> > BSC_t;
/* to allow caching of bit vectors that are expensive to create on
@@ -62,7 +62,7 @@ namespace ugdiss
friend class TSA_tree_iterator<TKN>;
protected:
- shared_ptr<Ttrack<TKN> const> corpus; // pointer to the underlying corpus
+ boost::shared_ptr<Ttrack<TKN> const> corpus; // pointer to the underlying corpus
char const* startArray; // beginning ...
char const* endArray; // ... and end ...
// of memory block storing the actual TSA
@@ -139,7 +139,7 @@ namespace ugdiss
getUpperBound(id_type id) const = 0;
public:
- shared_ptr<BSC_t> bsc;
+ boost::shared_ptr<BSC_t> bsc;
char const* arrayStart() const { return startArray; }
char const* arrayEnd() const { return endArray; }
@@ -182,7 +182,7 @@ namespace ugdiss
count_type
setBits(char const* startRange, char const* endRange,
- boost::dynamic_bitset<uint64_t>& bs) const;
+ boost::dynamic_bitset<typename ::uint64_t>& bs) const;
void
setTokenBits(char const* startRange, char const* endRange, size_t len,
@@ -201,7 +201,7 @@ namespace ugdiss
virtual
char const*
- readSid(char const* p, char const* q, uint64_t& sid) const = 0;
+ readSid(char const* p, char const* q, ::uint64_t& sid) const = 0;
/** read the offset part of the index entry into /offset/
* @return position of the next entry in the index.
@@ -216,7 +216,7 @@ namespace ugdiss
virtual
char const*
- readOffset(char const* p, char const* q, uint64_t& offset) const = 0;
+ readOffset(char const* p, char const* q, ::uint64_t& offset) const = 0;
/** @return sentence count
*/
@@ -268,26 +268,26 @@ namespace ugdiss
next 16 bits: offset from the start of the sentence
next 16 bits: length of the phrase
*/
- uint64_t
+ ::uint64_t
getSequenceId(typename vector<TKN>::const_iterator const& pstart,
typename vector<TKN>::const_iterator const& pstop) const;
- uint64_t
+ ::uint64_t
getSequenceId(TKN const* t, ushort plen) const;
/** Return the phrase represented by phrase ID pid_ */
string
- getSequence(uint64_t pid, TokenIndex const& V) const;
+ getSequence(::uint64_t pid, TokenIndex const& V) const;
/** Return the phrase represented by phrase ID pid_ */
vector<TKN>
- getSequence(uint64_t pid) const;
+ getSequence(::uint64_t pid) const;
TKN const*
- getSequenceStart(uint64_t) const;
+ getSequenceStart(::uint64_t) const;
ushort
- getSequenceLength(uint64_t) const;
+ getSequenceLength(::uint64_t) const;
size_t
getCorpusSize() const;
@@ -298,7 +298,7 @@ namespace ugdiss
bitset_pointer
getBitSet(TKN const* startKey, size_t keyLen) const;
- shared_ptr<bitvector>
+ boost::shared_ptr<bitvector>
findTree(TKN const* treeStart, TKN const* treeEnd,
bitvector const* filter) const;
@@ -638,7 +638,7 @@ namespace ugdiss
//---------------------------------------------------------------------------
template<typename TKN>
- uint64_t
+ ::uint64_t
TSA<TKN>::
getSequenceId(typename vector<TKN>::const_iterator const& pstart,
typename vector<TKN>::const_iterator const& pstop) const
@@ -649,7 +649,7 @@ namespace ugdiss
//---------------------------------------------------------------------------
template<typename TKN>
- uint64_t
+ ::uint64_t
TSA<TKN>::
getSequenceId(TKN const* pstart, ushort plen) const
{
@@ -657,7 +657,7 @@ namespace ugdiss
if (!p) return 0; // not found!
ArrayEntry I;
readEntry(p,I);
- uint64_t ret = I.sid;
+ ::uint64_t ret = I.sid;
ret <<= 16;
ret += I.offset;
ret <<= 16;
@@ -670,7 +670,7 @@ namespace ugdiss
template<typename TKN>
vector<TKN>
TSA<TKN>::
- getSequence(uint64_t pid) const
+ getSequence(::uint64_t pid) const
{
size_t plen = pid % 65536;
size_t offset = (pid >> 16) % 65536;
@@ -687,7 +687,7 @@ namespace ugdiss
template<typename TKN>
string
TSA<TKN>::
- getSequence(uint64_t pid, TokenIndex const& V) const
+ getSequence(::uint64_t pid, TokenIndex const& V) const
{
ostringstream buf;
TKN const* a = getSequenceStart(pid);
@@ -704,7 +704,7 @@ namespace ugdiss
template<typename TKN>
TKN const*
TSA<TKN>::
- getSequenceStart(uint64_t pid) const
+ getSequenceStart(::uint64_t pid) const
{
size_t offset = (pid >> 16) % 65536;
return corpus->sntStart(pid >> 32)+offset;
@@ -715,7 +715,7 @@ namespace ugdiss
template<typename TKN>
ushort
TSA<TKN>::
- getSequenceLength(uint64_t pid) const
+ getSequenceLength(::uint64_t pid) const
{
return (pid % 65536);
}
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h
index 46ce2f703..3111f1c1d 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h
@@ -24,7 +24,7 @@ namespace ugdiss
BitSetCache
{
public:
- typedef boost::dynamic_bitset<uint64_t> BitSet;
+ typedef boost::dynamic_bitset<typename ::uint64_t> BitSet;
typedef boost::shared_ptr<BitSet> bsptr;
typedef map<pair<char const*,ushort>,bsptr> myMap;
typedef myMap::iterator myMapIter;
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
index 14bf6cdad..ac8cbe24e 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
@@ -7,6 +7,8 @@
#include "ug_typedefs.h"
#include "tpt_tokenindex.h"
#include <iostream>
+#include "util/exception.hh"
+#include "moses/Util.h"
//#include <cassert>
// #include "ug_bv_iter.h"
@@ -60,10 +62,15 @@ namespace ugdiss
// TSA_tree_iterator(TSA_tree_iterator const& other);
TSA_tree_iterator(TSA<Token> const* s);
+ TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other);
TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
+ size_t const len,
+ bool full_match_only=true);
+ TSA_tree_iterator(TSA<Token> const* s,
+ Token const* kstart,
Token const* kend,
bool full_match_only=true);
// TSA_tree_iterator(TSA<Token> const* s,
@@ -80,7 +87,7 @@ namespace ugdiss
ushort getOffset(int p) const;
size_t sntCnt(int p=-1) const;
size_t rawCnt(int p=-1) const;
- uint64_t getPid(int p=-1) const; // get phrase id
+ ::uint64_t getPid(int p=-1) const; // get phrase id
virtual bool extend(Token const& id);
virtual bool extend(id_type id);
@@ -97,25 +104,25 @@ namespace ugdiss
// fillBitSet: deprecated; use markSentences() instead
count_type
- fillBitSet(boost::dynamic_bitset<uint64_t>& bitset) const;
+ fillBitSet(boost::dynamic_bitset<typename ::uint64_t>& bitset) const;
count_type
markEndOfSequence(Token const* start, Token const* stop,
- boost::dynamic_bitset<uint64_t>& dest) const;
+ boost::dynamic_bitset<typename ::uint64_t>& dest) const;
count_type
markSequence(Token const* start, Token const* stop, bitvector& dest) const;
count_type
- markSentences(boost::dynamic_bitset<uint64_t>& bitset) const;
+ markSentences(boost::dynamic_bitset<typename ::uint64_t>& bitset) const;
count_type
- markOccurrences(boost::dynamic_bitset<uint64_t>& bitset,
+ markOccurrences(boost::dynamic_bitset<typename ::uint64_t>& bitset,
bool markOnlyStartPosition=false) const;
count_type
markOccurrences(vector<ushort>& dest) const;
- uint64_t
+ ::uint64_t
getSequenceId() const;
// equivalent but more efficient than
@@ -150,9 +157,12 @@ namespace ugdiss
double approxOccurrenceCount(int p=-1) const
{
assert(root);
+ if (p < 0) p += lower.size();
double ret = arrayByteSpanSize(p)/root->aveIndexEntrySize();
- assert(ret < root->corpus->numTokens());
if (ret < 25) ret = rawCnt(p);
+ UTIL_THROW_IF2(ret > root->corpus->numTokens(), "[" << HERE << "] "
+ << "Word count mismatch.");
+ assert(ret <= root->corpus->numTokens());
return ret;
}
@@ -320,6 +330,18 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
+ TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other)
+ : root(s)
+ {
+ Token const* x = other.getToken(0);
+ for (size_t i = 0; i < other.size() && this->extend(x->id()); ++i)
+ x = x->next();
+ };
+
+
+
+ template<typename Token>
+ TSA_tree_iterator<Token>::
TSA_tree_iterator
(TSA<Token> const* r,
id_type const* s,
@@ -385,6 +407,25 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
+ size_t const len, bool full_match_only)
+ : root(s)
+ {
+ if (!root) return;
+ size_t i = 0;
+ for (; i < len && kstart && extend(*kstart); ++i)
+ kstart = kstart->next();
+ if (full_match_only && i != len)
+ {
+ lower.clear();
+ upper.clear();
+ }
+ };
+
+ // DEPRECATED: DO NOT USE. Use the one that takes the length
+ // instead of kend.
+ template<typename Token>
+ TSA_tree_iterator<Token>::
+ TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
Token const* kend, bool full_match_only)
: root(s)
{
@@ -474,7 +515,7 @@ namespace ugdiss
// ---------------------------------------------------------------------------
template<typename Token>
- uint64_t
+ ::uint64_t
TSA_tree_iterator<Token>::
getPid(int p) const
{
@@ -482,9 +523,9 @@ namespace ugdiss
if (p < 0) p += upper.size();
char const* lb = lower_bound(p);
char const* ub = upper_bound(p);
- uint64_t sid,off;
+ ::uint64_t sid,off;
root->readOffset(root->readSid(lb,ub,sid),ub,off);
- uint64_t ret = (sid<<32) + (off<<16) + uint64_t(p+1);
+ ::uint64_t ret = (sid<<32) + (off<<16) + ::uint64_t(p+1);
return ret;
}
@@ -561,8 +602,7 @@ namespace ugdiss
TSA_tree_iterator<Token>::
rawCnt(int p) const
{
- if (p < 0)
- p = lower.size()+p;
+ if (p < 0) p += lower.size();
assert(p>=0);
if (lower.size() == 0) return root->getCorpusSize();
return root->rawCnt(lower[p],upper[p]);
@@ -573,7 +613,7 @@ namespace ugdiss
template<typename Token>
count_type
TSA_tree_iterator<Token>::
- fillBitSet(boost::dynamic_bitset<uint64_t>& bitset) const
+ fillBitSet(boost::dynamic_bitset<typename ::uint64_t>& bitset) const
{
return markSentences(bitset);
}
@@ -583,7 +623,7 @@ namespace ugdiss
template<typename Token>
count_type
TSA_tree_iterator<Token>::
- markSentences(boost::dynamic_bitset<uint64_t>& bitset) const
+ markSentences(boost::dynamic_bitset<typename ::uint64_t>& bitset) const
{
assert(root && root->corpus);
bitset.resize(root->corpus->size());
@@ -610,7 +650,7 @@ namespace ugdiss
template<typename Token>
count_type
TSA_tree_iterator<Token>::
- markOccurrences(boost::dynamic_bitset<uint64_t>& bitset, bool markOnlyStartPosition) const
+ markOccurrences(boost::dynamic_bitset<typename ::uint64_t>& bitset, bool markOnlyStartPosition) const
{
assert(root && root->corpus);
if (bitset.size() != root->corpus->numTokens())
@@ -657,7 +697,7 @@ namespace ugdiss
count_type
TSA_tree_iterator<Token>::
markEndOfSequence(Token const* start, Token const* stop,
- boost::dynamic_bitset<uint64_t>& dest) const
+ boost::dynamic_bitset<typename ::uint64_t>& dest) const
{
count_type matchCount=0;
Token const* a = getToken(0);
@@ -714,7 +754,7 @@ namespace ugdiss
//---------------------------------------------------------------------------
template<typename Token>
- uint64_t
+ ::uint64_t
TSA_tree_iterator<Token>::
getSequenceId() const
{
@@ -722,7 +762,7 @@ namespace ugdiss
char const* p = this->lower_bound(-1);
typename Token::ArrayEntry I;
root->readEntry(p,I);
- return (uint64_t(I.sid)<<32)+(I.offset<<16)+this->size();
+ return (::uint64_t(I.sid)<<32)+(I.offset<<16)+this->size();
}
template<typename Token>
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index 128dcfe80..5b52161ca 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -1,14 +1,18 @@
#include "mmsapt.h"
#include <boost/foreach.hpp>
+#include <boost/scoped_ptr.hpp>
#include <boost/tokenizer.hpp>
#include <algorithm>
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
+#include "util/exception.hh"
+#include <set>
namespace Moses
{
using namespace bitext;
using namespace std;
using namespace boost;
-
+
void
fillIdSeq(Phrase const& mophrase, size_t const ifactor,
TokenIndex const& V, vector<id_type>& dest)
@@ -23,7 +27,7 @@ namespace Moses
void
- parseLine(string const& line, map<string,string> & params)
+ parseLine(string const& line, map<string,string> & param)
{
char_separator<char> sep("; ");
tokenizer<char_separator<char> > tokens(line,sep);
@@ -32,9 +36,14 @@ namespace Moses
size_t i = t.find_first_not_of(" =");
size_t j = t.find_first_of(" =",i+1);
size_t k = t.find_first_not_of(" =",j+1);
+ UTIL_THROW_IF2(i == string::npos || k == string::npos,
+ "[" << HERE << "] "
+ << "Parameter specification error near '"
+ << t << "' in moses ini line\n"
+ << line);
assert(i != string::npos);
assert(k != string::npos);
- params[t.substr(i,j)] = t.substr(k);
+ param[t.substr(i,j)] = t.substr(k);
}
}
@@ -47,15 +56,18 @@ namespace Moses
}
#endif
+ vector<string> const&
+ Mmsapt::
+ GetFeatureNames() const
+ {
+ return m_feature_names;
+ }
+
Mmsapt::
Mmsapt(string const& line)
- // : PhraseDictionary("Mmsapt",line), ofactor(1,0)
: PhraseDictionary(line)
- , withLogCountFeatures(false)
- , withPfwd(true), withPbwd(true)
, ofactor(1,0)
, m_tpc_ctr(0)
- // default values chosen for bwd probability
{
this->init(line);
}
@@ -84,74 +96,167 @@ namespace Moses
void
Mmsapt::
+ register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry)
+ {
+ registry.push_back(ff);
+ ff->setIndex(m_feature_names.size());
+ for (int i = 0; i < ff->fcnt(); ++i)
+ {
+ m_feature_names.push_back(ff->fname(i));
+ m_is_logval.push_back(ff->isLogVal(i));
+ m_is_integer.push_back(ff->isIntegerValued(i));
+ }
+ }
+
+ bool
+ Mmsapt::
+ isLogVal(int i) const { return m_is_logval.at(i); }
+
+ bool
+ Mmsapt::
+ isInteger(int i) const { return m_is_integer.at(i); }
+
+ void
+ Mmsapt::
init(string const& line)
{
map<string,string>::const_iterator m;
- map<string,string> param;
- parseLine(line,param);
+ parseLine(line,this->param);
+
+ this->m_numScoreComponents = atoi(param["num-features"].c_str());
m = param.find("config");
if (m != param.end())
read_config_file(m->second,param);
-
- bname = param["base"];
- L1 = param["L1"];
- L2 = param["L2"];
- assert(bname.size());
- assert(L1.size());
- assert(L2.size());
-
- m = param.find("pfwd_denom");
- m_pfwd_denom = m != param.end() ? m->second[0] : 's';
-
- m = param.find("smooth");
- m_lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
-
- m = param.find("max-samples");
- m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
- m = param.find("logcnt-features");
+ m = param.find("base");
if (m != param.end())
- withLogCountFeatures = m->second != "0";
+ {
+ bname = m->second;
+ m = param.find("path");
+ UTIL_THROW_IF2((m != param.end() && m->second != bname),
+ "Conflicting aliases for path:\n"
+ << "path=" << string(m->second) << "\n"
+ << "base=" << bname.c_str() );
+ }
+ else bname = param["path"];
+ L1 = param["L1"];
+ L2 = param["L2"];
+
+ UTIL_THROW_IF2(bname.size() == 0, "Missing corpus base name at " << HERE);
+ UTIL_THROW_IF2(L1.size() == 0, "Missing L1 tag at " << HERE);
+ UTIL_THROW_IF2(L2.size() == 0, "Missing L2 tag at " << HERE);
+
+ // set defaults for all parameters if not specified so far
+ pair<string,string> dflt("input-factor","0");
+ input_factor = atoi(param.insert(dflt).first->second.c_str());
+ // shouldn't that be a string?
+
+ dflt = pair<string,string> ("smooth",".01");
+ m_lbop_conf = atof(param.insert(dflt).first->second.c_str());
- m = param.find("pfwd");
- if (m != param.end())
- withPfwd = m->second != "0";
+ dflt = pair<string,string> ("lexalpha","0");
+ m_lex_alpha = atof(param.insert(dflt).first->second.c_str());
- m = param.find("pbwd");
- if (m != param.end())
- withPbwd = m->second != "0";
-
- m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
+ dflt = pair<string,string> ("sample","1000");
+ m_default_sample_size = atoi(param.insert(dflt).first->second.c_str());
- m = param.find("workers");
- m_workers = m != param.end() ? atoi(m->second.c_str()) : 8;
+ dflt = pair<string,string>("workers","8");
+ m_workers = atoi(param.insert(dflt).first->second.c_str());
m_workers = min(m_workers,24UL);
- m = param.find("cache-size");
- m_history.reserve(m != param.end()
- ? max(1000,atoi(m->second.c_str()))
- : 10000);
- this->m_numScoreComponents = atoi(param["num-features"].c_str());
-
- // num_features = 0;
- m = param.find("ifactor");
- input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
+ dflt = pair<string,string>("table-limit","20");
+ m_tableLimit = atoi(param.insert(dflt).first->second.c_str());
+
+ dflt = pair<string,string>("cache","10000");
+ size_t hsize = max(1000,atoi(param.insert(dflt).first->second.c_str()));
+ m_history.reserve(hsize);
+ // in plain language: cache size is at least 1000, and 10,000 by default
+ // this cache keeps track of the most frequently used target phrase collections
+ // even when not actively in use
+
+ // Feature functions are initialized in function Load();
+ param.insert(pair<string,string>("pfwd", "g"));
+ param.insert(pair<string,string>("pbwd", "g"));
+ param.insert(pair<string,string>("logcnt", "0"));
+ param.insert(pair<string,string>("coh", "0"));
+ param.insert(pair<string,string>("rare", "1"));
+ param.insert(pair<string,string>("prov", "1"));
+
poolCounts = true;
- m = param.find("extra");
- if (m != param.end())
+
+ if ((m = param.find("bias")) != param.end())
+ bias_file = m->second;
+
+ if ((m = param.find("extra")) != param.end())
+ extra_data = m->second;
+
+ dflt = pair<string,string>("tuneable","true");
+ m_tuneable = Scan<bool>(param.insert(dflt).first->second.c_str());
+
+ dflt = pair<string,string>("feature-sets","standard");
+ m_feature_set_names = Tokenize(param.insert(dflt).first->second.c_str(), ",");
+ m = param.find("name");
+ if (m != param.end()) m_name = m->second;
+
+ // check for unknown parameters
+ vector<string> known_parameters; known_parameters.reserve(50);
+ known_parameters.push_back("L1");
+ known_parameters.push_back("L2");
+ known_parameters.push_back("Mmsapt");
+ known_parameters.push_back("PhraseDictionaryBitextSampling"); // alias for Mmsapt
+ known_parameters.push_back("base"); // alias for path
+ known_parameters.push_back("bias");
+ known_parameters.push_back("cache");
+ known_parameters.push_back("coh");
+ known_parameters.push_back("config");
+ known_parameters.push_back("extra");
+ known_parameters.push_back("feature-sets");
+ known_parameters.push_back("input-factor");
+ known_parameters.push_back("lexalpha");
+ // known_parameters.push_back("limit"); // replaced by "table-limit"
+ known_parameters.push_back("logcnt");
+ known_parameters.push_back("name");
+ known_parameters.push_back("num-features");
+ known_parameters.push_back("output-factor");
+ known_parameters.push_back("path");
+ known_parameters.push_back("pbwd");
+ known_parameters.push_back("pfwd");
+ known_parameters.push_back("prov");
+ known_parameters.push_back("rare");
+ known_parameters.push_back("sample");
+ known_parameters.push_back("smooth");
+ known_parameters.push_back("table-limit");
+ known_parameters.push_back("tuneable");
+ known_parameters.push_back("unal");
+ known_parameters.push_back("workers");
+ sort(known_parameters.begin(),known_parameters.end());
+ for (map<string,string>::iterator m = param.begin(); m != param.end(); ++m)
{
- extra_data = m->second;
- // cerr << "have extra data" << endl;
+ UTIL_THROW_IF2(!binary_search(known_parameters.begin(),
+ known_parameters.end(), m->first),
+ HERE << ": Unknown parameter specification for Mmsapt: "
+ << m->first);
}
- // keeps track of the most frequently used target phrase collections
- // (to keep them cached even when not actively in use)
+ }
+
+ void
+ Mmsapt::
+ load_bias(string const fname)
+ {
+ ifstream in(fname.c_str());
+ bias.reserve(btfix.T1->size());
+ float v;
+ while (in>>v) bias.push_back(v);
+ UTIL_THROW_IF2(bias.size() != btfix.T1->size(),
+ "Mismatch between bias vector size and corpus size at "
+ << HERE);
}
void
Mmsapt::
- load_extra_data(string bname)
+ load_extra_data(string bname, bool locking = true)
{
// TO DO: ADD CHECKS FOR ROBUSTNESS
// - file existence?
@@ -169,59 +274,153 @@ namespace Moses
while(getline(in2,line)) text2.push_back(line);
while(getline(ina,line)) symal.push_back(line);
- lock_guard<mutex> guard(this->lock);
+ boost::scoped_ptr<boost::lock_guard<boost::mutex> > guard;
+ if (locking) guard.reset(new boost::lock_guard<boost::mutex>(this->lock));
btdyn = btdyn->add(text1,text2,symal);
assert(btdyn);
- // cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl;
+ cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl;
}
+ template<typename fftype>
void
Mmsapt::
- Load()
+ check_ff(string const ffname, vector<sptr<pscorer> >* registry)
{
- btfix.num_workers = this->m_workers;
- btfix.open(bname, L1, L2);
- btfix.setDefaultSampleSize(m_default_sample_size);
-
- size_t num_feats = 0;
- // TO DO: should we use different lbop parameters
- // for the relative-frequency based features?
-
- if (withLogCountFeatures) num_feats = add_logcounts_fix.init(num_feats);
+ string const& spec = param[ffname];
+ if (spec == "" || spec == "0") return;
+ if (registry)
+ {
+ sptr<fftype> ff(new fftype(spec));
+ register_ff(ff, *registry);
+ }
+ else if (spec[spec.size()-1] == '+') // corpus specific
+ {
+ sptr<fftype> ff(new fftype(spec));
+ register_ff(ff, m_active_ff_fix);
+ ff.reset(new fftype(spec));
+ register_ff(ff, m_active_ff_dyn);
+ }
+ else
+ {
+ sptr<fftype> ff(new fftype(spec));
+ register_ff(ff, m_active_ff_common);
+ }
+ }
- float const lbop = m_lbop_parameter; // just for code readability below
- if (withPfwd) num_feats = calc_pfwd_fix.init(num_feats,lbop,m_pfwd_denom);
- if (withPbwd) num_feats = calc_pbwd_fix.init(num_feats,lbop);
-
- // currently always active by default; may (should) change later
- num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
+ template<typename fftype>
+ void
+ Mmsapt::
+ check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry)
+ {
+ string const& spec = param[ffname];
+ if (spec == "" || spec == "0") return;
+ if (registry)
+ {
+ sptr<fftype> ff(new fftype(xtra,spec));
+ register_ff(ff, *registry);
+ }
+ else if (spec[spec.size()-1] == '+') // corpus specific
+ {
+ sptr<fftype> ff(new fftype(xtra,spec));
+ register_ff(ff, m_active_ff_fix);
+ ff.reset(new fftype(xtra,spec));
+ register_ff(ff, m_active_ff_dyn);
+ }
+ else
+ {
+ sptr<fftype> ff(new fftype(xtra,spec));
+ register_ff(ff, m_active_ff_common);
+ }
+ }
- if (this->m_numScoreComponents%2) // a bit of a hack, for backwards compatibility
- num_feats = apply_pp.init(num_feats);
+ // void
+ // Mmsapt::
+ // add_corpus_specific_features(vector<sptr<pscorer > >& registry)
+ // {
+ // check_ff<PScorePbwd<Token> >("pbwd",m_lbop_conf,registry);
+ // check_ff<PScoreLogCnt<Token> >("logcnt",registry);
+ // }
- if (num_feats < this->m_numScoreComponents)
+ void
+ Mmsapt::
+ Load()
+ {
+ Load(true);
+ }
+
+ void
+ Mmsapt::
+ Load(bool with_checks)
+ {
+ boost::lock_guard<boost::mutex> guard(this->lock);
+
+ // can load only once
+ // UTIL_THROW_IF2(shards.size(),"Mmsapt is already loaded at " << HERE);
+
+ // load feature sets
+ BOOST_FOREACH(string const& fsname, m_feature_set_names)
{
- poolCounts = false;
- if (withLogCountFeatures) num_feats = add_logcounts_dyn.init(num_feats);
- if (withPfwd) num_feats = calc_pfwd_dyn.init(num_feats,lbop,m_pfwd_denom);
- if (withPbwd) num_feats = calc_pbwd_dyn.init(num_feats,lbop);
+ // standard (default) feature set
+ if (fsname == "standard")
+ {
+ // lexical scores
+ string lexfile = bname + L1 + "-" + L2 + ".lex";
+ sptr<PScoreLex1<Token> > ff(new PScoreLex1<Token>(param["lex_alpha"],lexfile));
+ register_ff(ff,m_active_ff_common);
+
+ // these are always computed on pooled data
+ check_ff<PScoreRareness<Token> > ("rare", &m_active_ff_common);
+ check_ff<PScoreUnaligned<Token> >("unal", &m_active_ff_common);
+ check_ff<PScoreCoherence<Token> >("coh", &m_active_ff_common);
+
+ // for these ones either way is possible (specification ends with '+'
+ // if corpus-specific
+ check_ff<PScorePfwd<Token> >("pfwd", m_lbop_conf);
+ check_ff<PScorePbwd<Token> >("pbwd", m_lbop_conf);
+ check_ff<PScoreLogCnt<Token> >("logcnt");
+
+ // These are always corpus-specific
+ check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_fix);
+ check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_dyn);
+ }
+
+ // data source features (copies of phrase and word count specific to
+ // this translation model)
+ else if (fsname == "datasource")
+ {
+ sptr<PScorePC<Token> > ffpcnt(new PScorePC<Token>("pcnt"));
+ register_ff(ffpcnt,m_active_ff_common);
+ sptr<PScoreWC<Token> > ffwcnt(new PScoreWC<Token>("wcnt"));
+ register_ff(ffwcnt,m_active_ff_common);
+ }
}
+ // cerr << "Features: " << Join("|",m_feature_names) << endl;
- if (num_feats != this->m_numScoreComponents)
+ if (with_checks)
{
- ostringstream buf;
- buf << "At " << __FILE__ << ":" << __LINE__
- << ": number of feature values provided by Phrase table"
- << " does not match number specified in Moses config file!";
- throw buf.str().c_str();
+ UTIL_THROW_IF2(this->m_feature_names.size() != this->m_numScoreComponents,
+ "At " << HERE << ": number of feature values provided by "
+ << "Phrase table (" << this->m_feature_names.size()
+ << ") does not match number specified in Moses config file ("
+ << this->m_numScoreComponents << ")!\n";);
}
- // cerr << "MMSAPT provides " << num_feats << " features at "
- // << __FILE__ << ":" << __LINE__ << endl;
-
- btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2,m_default_sample_size));
+ // Load corpora. For the time being, we can have one memory-mapped static
+ // corpus and one in-memory dynamic corpus
+ // sptr<mmbitext> btfix(new mmbitext());
+ btfix.num_workers = this->m_workers;
+ btfix.open(bname, L1, L2);
+ btfix.setDefaultSampleSize(m_default_sample_size);
+ // shards.push_back(btfix);
+
+ btdyn.reset(new imbitext(btfix.V1, btfix.V2, m_default_sample_size));
btdyn->num_workers = this->m_workers;
- if (extra_data.size()) load_extra_data(extra_data);
-
+ if (bias_file.size())
+ load_bias(bias_file);
+
+ if (extra_data.size())
+ load_extra_data(extra_data,false);
+
+#if 0
// currently not used
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
typedef LexicalPhraseScorer2<Token>::table_t::Cell cell_t;
@@ -230,7 +429,9 @@ namespace Moses
for (cell_t const* c = COOC[r].start; c < COOC[r].stop; ++c)
wlex21[c->id].push_back(r);
COOCraw.open(bname + L1 + "-" + L2 + ".coc");
-
+#endif
+ assert(btdyn);
+ // cerr << "LOADED " << HERE << endl;
}
void
@@ -247,337 +448,75 @@ namespace Moses
TargetPhrase*
Mmsapt::
- createTargetPhrase(Phrase const& src,
- Bitext<Token> const& bt,
- PhrasePair const& pp) const
+ mkTPhrase(Phrase const& src,
+ PhrasePair<Token>* fix,
+ PhrasePair<Token>* dyn,
+ sptr<Bitext<Token> > const& dynbt) const
{
- Word w; uint32_t sid,off,len;
- TargetPhrase* tp = new TargetPhrase();
- parse_pid(pp.p2, sid, off, len);
- Token const* x = bt.T2->sntStart(sid) + off;
- for (uint32_t k = 0; k < len; ++k)
+ UTIL_THROW_IF2(!fix && !dyn, HERE <<
+ ": Can't create target phrase from nothing.");
+ vector<float> fvals(this->m_numScoreComponents);
+ PhrasePair<Token> pool = fix ? *fix : *dyn;
+ if (fix)
{
- // cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl;
- StringPiece wrd = (*bt.V2)[x[k].id()];
- // if ((off+len) > bt.T2->sntLen(sid))
- // cerr << off << ";" << len << " " << bt.T2->sntLen(sid) << endl;
- assert(off+len <= bt.T2->sntLen(sid));
- w.CreateFromString(Output,ofactor,wrd,false);
- tp->AddWord(w);
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ (*ff)(btfix, *fix, &fvals);
}
- tp->GetScoreBreakdown().Assign(this, pp.fvals);
- tp->Evaluate(src);
- return tp;
- }
-
- // process phrase stats from a single parallel corpus
- void
- Mmsapt::
- process_pstats
- (Phrase const& src,
- uint64_t const pid1,
- pstats const& stats,
- Bitext<Token> const & bt,
- TargetPhraseCollection* tpcoll
- ) const
- {
- PhrasePair pp;
- pp.init(pid1, stats, this->m_numScoreComponents);
- if (this->m_numScoreComponents%2)
- apply_pp(bt,pp);
- pstats::trg_map_t::const_iterator t;
- for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
+ if (dyn)
{
- pp.update(t->first,t->second);
- calc_lex(bt,pp);
- if (withPfwd) calc_pfwd_fix(bt,pp);
- if (withPbwd) calc_pbwd_fix(bt,pp);
- if (withLogCountFeatures) add_logcounts_fix(bt,pp);
- tpcoll->Add(createTargetPhrase(src,bt,pp));
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+ (*ff)(*dynbt, *dyn, &fvals);
}
- }
-
- // process phrase stats from a single parallel corpus
- bool
- Mmsapt::
- pool_pstats(Phrase const& src,
- uint64_t const pid1a,
- pstats * statsa,
- Bitext<Token> const & bta,
- uint64_t const pid1b,
- pstats const* statsb,
- Bitext<Token> const & btb,
- TargetPhraseCollection* tpcoll) const
- {
- PhrasePair pp;
- if (statsa && statsb)
- pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
- else if (statsa)
- pp.init(pid1a, *statsa, this->m_numScoreComponents);
- else if (statsb)
- pp.init(pid1b, *statsb, this->m_numScoreComponents);
- else return false; // throw "no stats for pooling available!";
-
- if (this->m_numScoreComponents%2)
- apply_pp(bta,pp);
- pstats::trg_map_t::const_iterator b;
- pstats::trg_map_t::iterator a;
- if (statsb)
+
+ if (fix && dyn) { pool += *dyn; }
+ else if (fix)
{
- for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
- {
- uint32_t sid,off,len;
- parse_pid(b->first, sid, off, len);
- Token const* x = bta.T2->sntStart(sid) + off;
- TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
- if (m.size() == len)
- {
- ;
- if (statsa && ((a = statsa->trg.find(m.getPid()))
- != statsa->trg.end()))
- {
- pp.update(b->first,a->second,b->second);
- a->second.invalidate();
- }
- else
- pp.update(b->first,m.approxOccurrenceCount(),
- b->second);
- }
- else pp.update(b->first,b->second);
- calc_lex(btb,pp);
- if (withPfwd) calc_pfwd_fix(btb,pp);
- if (withPbwd) calc_pbwd_fix(btb,pp);
- if (withLogCountFeatures) add_logcounts_fix(btb,pp);
- tpcoll->Add(createTargetPhrase(src,btb,pp));
- }
+ PhrasePair<Token> zilch; zilch.init();
+ TSA<Token>::tree_iterator m(dynbt->I2.get(), fix->start2, fix->len2);
+ if (m.size() == fix->len2)
+ zilch.raw2 = m.approxOccurrenceCount();
+ pool += zilch;
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+ (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
}
- if (!statsa) return statsb != NULL;
- for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
+ else if (dyn)
{
- uint32_t sid,off,len;
- if (!a->second.valid()) continue;
- parse_pid(a->first, sid, off, len);
- if (btb.T2)
- {
- Token const* x = bta.T2->sntStart(sid) + off;
- TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len);
- if (m.size() == len)
- pp.update(a->first,m.approxOccurrenceCount(),a->second);
- else
- pp.update(a->first,a->second);
- }
- else
- pp.update(a->first,a->second);
-
- UTIL_THROW_IF2(pp.raw2 == 0,
- "OOPS"
- << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: "
- << bta.T2->pid2str(bta.V2.get(),pp.p2) << ": "
- << pp.raw1 << " " << pp.sample1 << " "
- << pp.good1 << " " << pp.joint << " "
- << pp.raw2);
-#if 0
- jstats const& j = a->second;
- cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: "
- << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl;
- cerr << j.rcnt() << " " << j.cnt2() << " " << j.wcnt() << endl;
-#endif
- calc_lex(bta,pp);
- if (withPfwd) calc_pfwd_fix(bta,pp);
- if (withPbwd) calc_pbwd_fix(bta,pp);
- if (withLogCountFeatures) add_logcounts_fix(bta,pp);
- tpcoll->Add(createTargetPhrase(src,bta,pp));
+ PhrasePair<Token> zilch; zilch.init();
+ TSA<Token>::tree_iterator m(btfix.I2.get(), dyn->start2, dyn->len2);
+ if (m.size() == dyn->len2)
+ zilch.raw2 = m.approxOccurrenceCount();
+ pool += zilch;
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
}
- return true;
-}
-
-
- // process phrase stats from a single parallel corpus
- bool
- Mmsapt::
- combine_pstats
- (Phrase const& src,
- uint64_t const pid1a,
- pstats * statsa,
- Bitext<Token> const & bta,
- uint64_t const pid1b,
- pstats const* statsb,
- Bitext<Token> const & btb,
- TargetPhraseCollection* tpcoll
- ) const
- {
- PhrasePair ppfix,ppdyn,pool;
- Word w;
- if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
- if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
- pstats::trg_map_t::const_iterator b;
- pstats::trg_map_t::iterator a;
- if (statsb)
+ if (fix)
{
- pool.init(pid1b,*statsb,0);
- if (this->m_numScoreComponents%2)
- apply_pp(btb,ppdyn);
- for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
- {
- ppdyn.update(b->first,b->second);
- if (withPfwd) calc_pfwd_dyn(btb,ppdyn);
- if (withPbwd) calc_pbwd_dyn(btb,ppdyn);
- if (withLogCountFeatures) add_logcounts_dyn(btb,ppdyn);
- calc_lex(btb,ppdyn);
-
- uint32_t sid,off,len;
- parse_pid(b->first, sid, off, len);
- Token const* x = bta.T2->sntStart(sid) + off;
- TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
- if (m.size() && statsa &&
- ((a = statsa->trg.find(m.getPid()))
- != statsa->trg.end()))
- {
- ppfix.update(a->first,a->second);
- if (withPfwd) calc_pfwd_fix(bta,ppfix,&ppdyn.fvals);
- if (withPbwd) calc_pbwd_fix(bta,ppfix,&ppdyn.fvals);
- if (withLogCountFeatures) add_logcounts_fix(bta,ppfix,&ppdyn.fvals);
- a->second.invalidate();
- }
- else
- {
- if (m.size())
- pool.update(b->first,m.approxOccurrenceCount(),
- b->second);
- else
- pool.update(b->first,b->second);
- if (withPfwd) calc_pfwd_fix(btb,pool,&ppdyn.fvals);
- if (withPbwd) calc_pbwd_fix(btb,pool,&ppdyn.fvals);
- if (withLogCountFeatures) add_logcounts_fix(btb,pool,&ppdyn.fvals);
- }
- tpcoll->Add(createTargetPhrase(src,btb,ppdyn));
- }
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+ (*ff)(btfix, pool, &fvals);
+ }
+ else
+ {
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+ (*ff)(*dynbt, pool, &fvals);
}
- if (statsa)
+ TargetPhrase* tp = new TargetPhrase(this);
+ Token const* x = fix ? fix->start2 : dyn->start2;
+ uint32_t len = fix ? fix->len2 : dyn->len2;
+ for (uint32_t k = 0; k < len; ++k, x = x->next())
{
- pool.init(pid1a,*statsa,0);
- if (this->m_numScoreComponents%2)
- apply_pp(bta,ppfix);
- for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
- {
- if (!a->second.valid()) continue; // done above
- ppfix.update(a->first,a->second);
- if (withPfwd) calc_pfwd_fix(bta,ppfix);
- if (withPbwd) calc_pbwd_fix(bta,ppfix);
- if (withLogCountFeatures) add_logcounts_fix(bta,ppfix);
- calc_lex(bta,ppfix);
-
- if (btb.I2)
- {
- uint32_t sid,off,len;
- parse_pid(a->first, sid, off, len);
- Token const* x = bta.T2->sntStart(sid) + off;
- TSA<Token>::tree_iterator m(btb.I2.get(),x,x+len);
- if (m.size())
- pool.update(a->first,m.approxOccurrenceCount(),a->second);
- else
- pool.update(a->first,a->second);
- }
- else pool.update(a->first,a->second);
- if (withPfwd) calc_pfwd_dyn(bta,pool,&ppfix.fvals);
- if (withPbwd) calc_pbwd_dyn(bta,pool,&ppfix.fvals);
- if (withLogCountFeatures) add_logcounts_dyn(bta,pool,&ppfix.fvals);
- }
- if (ppfix.p2)
- tpcoll->Add(createTargetPhrase(src,bta,ppfix));
+ StringPiece wrd = (*(btfix.V2))[x->id()];
+ Word w; w.CreateFromString(Output,ofactor,wrd,false);
+ tp->AddWord(w);
}
- return (statsa || statsb);
+ tp->SetAlignTerm(pool.aln);
+ tp->GetScoreBreakdown().Assign(this, fvals);
+ tp->EvaluateInIsolation(src);
+ return tp;
}
-
- // // phrase statistics combination treating the two knowledge
- // // sources separately with backoff to pooling when only one
- // // of the two knowledge sources contains the phrase pair in
- // // question
- // void
- // Mmsapt::
- // process_pstats(uint64_t const mypid1,
- // uint64_t const otpid1,
- // pstats const& mystats, // my phrase stats
- // pstats const* otstats, // other phrase stats
- // Bitext<Token> const & mybt, // my bitext
- // Bitext<Token> const * otbt, // other bitext
- // PhraseScorer<Token> const& mypfwd,
- // PhraseScorer<Token> const& mypbwd,
- // PhraseScorer<Token> const* otpfwd,
- // PhraseScorer<Token> const* otpbwd,
- // TargetPhraseCollection* tpcoll)
- // {
- // boost::unordered_map<uint64_t,jstats>::const_iterator t;
- // vector<FactorType> ofact(1,0);
- // PhrasePair mypp,otpp,combo;
- // mypp.init(mypid1, mystats, this->m_numScoreComponents);
- // if (otstats)
- // {
- // otpp.init(otpid1, *otstats, 0);
- // combo.init(otpid1, mystats, *otstats, 0);
- // }
- // else combo = mypp;
-
- // for (t = mystats.trg.begin(); t != mystats.trg.end(); ++t)
- // {
- // if (!t->second.valid()) continue;
- // // we dealt with this phrase pair already;
- // // see j->second.invalidate() below;
- // uint32_t sid,off,len; parse_pid(t->first,sid,off,len);
-
- // mypp.update(t->first,t->second);
- // apply_pp(mybt,mypp);
- // calc_lex (mybt,mypp);
- // mypfwd(mybt,mypp);
- // mypbwd(mybt,mypp);
-
- // if (otbt) // it's a dynamic phrase table
- // {
- // assert(otpfwd);
- // assert(otpbwd);
- // boost::unordered_map<uint64_t,jstats>::iterator j;
-
- // // look up the current target phrase in the other bitext
- // Token const* x = mybt.T2->sntStart(sid) + off;
- // TSA<TOKEN>::tree_iterator m(otbt->I2.get(),x,x+len);
- // if (otstats // source phrase exists in other bitext
- // && m.size() // target phrase exists in other bitext
- // && ((j = otstats->trg.find(m.getPid()))
- // != otstats->trg.end())) // phrase pair found in other bitext
- // {
- // otpp.update(j->first,j->second);
- // j->second.invalidate(); // mark the phrase pair as seen
- // otpfwd(*otbt,otpp,&mypp.fvals);
- // otpbwd(*otbt,otpp,&mypp.fvals);
- // }
- // else
- // {
- // if (m.size()) // target phrase seen in other bitext, but not the phrase pair
- // combo.update(t->first,m.approxOccurrenceCount(),t->second);
- // else
- // combo.update(t->first,t->second);
- // (*otpfwd)(mybt,combo,&mypp.fvals);
- // (*otpbwd)(mybt,combo,&mypp.fvals);
- // }
- // }
-
- // // now add the phrase pair to the TargetPhraseCollection:
- // TargetPhrase* tp = new TargetPhrase();
- // for (size_t k = off; k < stop; ++k)
- // {
- // StringPiece wrd = (*mybt.V2)[x[k].id()];
- // Word w; w.CreateFromString(Output,ofact,wrd,false);
- // tp->AddWord(w);
- // }
- // tp->GetScoreBreakdown().Assign(this,mypp.fvals);
- // tp->Evaluate(src);
- // tpcoll->Add(tp);
- // }
- // }
-
+
Mmsapt::
TargetPhraseCollectionWrapper::
- TargetPhraseCollectionWrapper(size_t r, uint64_t k)
+ TargetPhraseCollectionWrapper(size_t r, ::uint64_t k)
: revision(r), key(k), refCount(0), idx(-1)
{ }
@@ -587,9 +526,7 @@ namespace Moses
{
assert(this->refCount == 0);
}
-
-
// This is not the most efficient way of phrase lookup!
TargetPhraseCollection const*
Mmsapt::
@@ -597,28 +534,26 @@ namespace Moses
{
// map from Moses Phrase to internal id sequence
vector<id_type> sphrase;
- fillIdSeq(src,input_factor,*btfix.V1,sphrase);
+ fillIdSeq(src,input_factor,*(btfix.V1),sphrase);
if (sphrase.size() == 0) return NULL;
- // lookup in static bitext
- TSA<Token>::tree_iterator mfix(btfix.I1.get(),&sphrase[0],sphrase.size());
-
- // lookup in dynamic bitext
// Reserve a local copy of the dynamic bitext in its current form. /btdyn/
// is set to a new copy of the dynamic bitext every time a sentence pair
// is added. /dyn/ keeps the old bitext around as long as we need it.
sptr<imBitext<Token> > dyn;
{ // braces are needed for scoping mutex lock guard!
boost::lock_guard<boost::mutex> guard(this->lock);
+ assert(btdyn);
dyn = btdyn;
}
assert(dyn);
+
+ // lookup phrases in both bitexts
+ TSA<Token>::tree_iterator mfix(btfix.I1.get(), &sphrase[0], sphrase.size());
TSA<Token>::tree_iterator mdyn(dyn->I1.get());
if (dyn->I1.get())
- {
- for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
- mdyn.extend(sphrase[i]);
- }
+ for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
+ mdyn.extend(sphrase[i]);
#if 0
cerr << src << endl;
@@ -626,43 +561,72 @@ namespace Moses
<< mdyn.size() << " " << mdyn.getPid() << endl;
#endif
- // phrase not found in either
- if (mdyn.size() != sphrase.size() &&
- mfix.size() != sphrase.size())
- return NULL; // not found
+ if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size())
+ return NULL; // phrase not found in either bitext
// cache lookup:
-
- uint64_t phrasekey;
- if (mfix.size() == sphrase.size())
- phrasekey = (mfix.getPid()<<1);
- else
- phrasekey = (mdyn.getPid()<<1)+1;
-
+ ::uint64_t phrasekey = (mfix.size() == sphrase.size() ? (mfix.getPid()<<1)
+ : (mdyn.getPid()<<1)+1);
size_t revision = dyn->revision();
{
boost::lock_guard<boost::mutex> guard(this->lock);
tpc_cache_t::iterator c = m_cache.find(phrasekey);
+ // TO DO: we should revise the revision mechanism: we take the length
+ // of the dynamic bitext (in sentences) at the time the PT entry
+ // was stored as the time stamp. For each word in the
+ // vocabulary, we also store its most recent occurrence in the
+ // bitext. Only if the timestamp of each word in the phrase is
+ // newer than the timestamp of the phrase itself we must update
+ // the entry.
if (c != m_cache.end() && c->second->revision == revision)
return encache(c->second);
}
- // not found or not up to date
+ // OK: pt entry not found or not up to date
+ // lookup and expansion could be done in parallel threds,
+ // but ppdyn is probably small anyway
+ // TO DO: have Bitexts return lists of PhrasePairs instead of pstats
+ // no need to expand pstats at every single lookup again, especially
+ // for btfix.
sptr<pstats> sfix,sdyn;
- if (mfix.size() == sphrase.size())
+ if (mfix.size() == sphrase.size())
sfix = btfix.lookup(mfix);
- if (mdyn.size() == sphrase.size())
- sdyn = dyn->lookup(mdyn);
-
- TargetPhraseCollectionWrapper*
- ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
- if ((poolCounts &&
- pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
- mdyn.getPid(),sdyn.get(),*dyn,ret))
- || combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
- mdyn.getPid(),sdyn.get(),*dyn,ret))
+ if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(mdyn);
+
+ vector<PhrasePair<Token> > ppfix,ppdyn;
+ PhrasePair<Token>::SortByTargetIdSeq sort_by_tgt_id;
+ if (sfix)
+ {
+ expand(mfix, btfix, *sfix, ppfix);
+ sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id);
+ }
+ if (sdyn)
+ {
+ expand(mdyn, *dyn, *sdyn, ppdyn);
+ sort(ppdyn.begin(), ppdyn.end(),sort_by_tgt_id);
+ }
+
+ // now we have two lists of Phrase Pairs, let's merge them
+ TargetPhraseCollectionWrapper* ret;
+ ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
+ PhrasePair<Token>::SortByTargetIdSeq sorter;
+ size_t i = 0; size_t k = 0;
+ while (i < ppfix.size() && k < ppdyn.size())
+ {
+ int cmp = sorter.cmp(ppfix[i], ppdyn[k]);
+ if (cmp < 0) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn));
+ else if (cmp == 0) ret->Add(mkTPhrase(src,&ppfix[i++],&ppdyn[k++],dyn));
+ else ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn));
+ }
+ while (i < ppfix.size()) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn));
+ while (k < ppdyn.size()) ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn));
+ if (m_tableLimit) ret->Prune(true, m_tableLimit);
+ else ret->Prune(true,ret->GetSize());
+#if 0
+ if (combine_pstats(src,
+ mfix.getPid(), sfix.get(), btfix,
+ mdyn.getPid(), sdyn.get(), *dyn, ret))
{
- ret->NthElement(m_tableLimit);
#if 0
sort(ret->begin(), ret->end(), CompareTargetPhrase());
cout << "SOURCE PHRASE: " << src << endl;
@@ -678,11 +642,22 @@ namespace Moses
}
#endif
}
+#endif
+
+ // put the result in the cache and return
boost::lock_guard<boost::mutex> guard(this->lock);
m_cache[phrasekey] = ret;
return encache(ret);
}
+ size_t
+ Mmsapt::
+ SetTableLimit(size_t limit)
+ {
+ std::swap(m_tableLimit,limit);
+ return limit;
+ }
+
void
Mmsapt::
CleanUpAfterSentenceProcessing(const InputType& source)
@@ -711,6 +686,7 @@ namespace Moses
// assert(0);
}
+#if defined(timespec)
bool operator<(timespec const& a, timespec const& b)
{
if (a.tv_sec != b.tv_sec) return a.tv_sec < b.tv_sec;
@@ -722,6 +698,19 @@ namespace Moses
if (a.tv_sec != b.tv_sec) return a.tv_sec > b.tv_sec;
return (a.tv_nsec >= b.tv_nsec);
}
+#endif
+
+ bool operator<(timeval const& a, timeval const& b)
+ {
+ if (a.tv_sec != b.tv_sec) return a.tv_sec < b.tv_sec;
+ return (a.tv_usec < b.tv_usec);
+ }
+
+ bool operator>=(timeval const& a, timeval const& b)
+ {
+ if (a.tv_sec != b.tv_sec) return a.tv_sec > b.tv_sec;
+ return (a.tv_usec >= b.tv_usec);
+ }
void
bubble_up(vector<Mmsapt::TargetPhraseCollectionWrapper*>& v, size_t k)
@@ -752,12 +741,10 @@ namespace Moses
decache(TargetPhraseCollectionWrapper* ptr) const
{
if (ptr->refCount || ptr->idx >= 0) return;
-
- timespec t; clock_gettime(CLOCK_MONOTONIC,&t);
- timespec r; clock_getres(CLOCK_MONOTONIC,&r);
-
// if (t.tv_nsec < v[0]->tstamp.tv_nsec)
#if 0
+ timespec t; clock_gettime(CLOCK_MONOTONIC,&t);
+ timespec r; clock_getres(CLOCK_MONOTONIC,&r);
float delta = t.tv_sec - ptr->tstamp.tv_sec;
cerr << "deleting old cache entry after "
<< delta << " seconds."
@@ -782,8 +769,11 @@ namespace Moses
if (!ptr) return NULL;
++ptr->refCount;
++m_tpc_ctr;
+#if defined(timespec)
clock_gettime(CLOCK_MONOTONIC, &ptr->tstamp);
-
+#else
+ gettimeofday(&ptr->tstamp, NULL);
+#endif
// update history
if (m_history.capacity() > 1)
{
@@ -816,6 +806,13 @@ namespace Moses
Mmsapt::
PrefixExists(Moses::Phrase const& phrase) const
{
+ return PrefixExists(phrase,NULL);
+ }
+
+ bool
+ Mmsapt::
+ PrefixExists(Moses::Phrase const& phrase, vector<float> const* const bias) const
+ {
if (phrase.GetSize() == 0) return false;
vector<id_type> myphrase;
fillIdSeq(phrase,input_factor,*btfix.V1,myphrase);
@@ -823,6 +820,7 @@ namespace Moses
TSA<Token>::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size());
if (mfix.size() == myphrase.size())
{
+ btfix.prep(mfix,bias);
// cerr << phrase << " " << mfix.approxOccurrenceCount() << endl;
return true;
}
@@ -838,6 +836,8 @@ namespace Moses
{
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
mdyn.extend(myphrase[i]);
+ // let's assume a uniform bias over the foreground corpus
+ if (mdyn.size() == myphrase.size()) dyn->prep(mdyn,NULL);
}
return mdyn.size() == myphrase.size();
}
@@ -869,4 +869,11 @@ namespace Moses
return true;
}
+ string const&
+ Mmsapt::
+ GetName() const
+ {
+ return m_name;
+ }
+
}
diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h
index 5353a1c46..454422a85 100644
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@@ -19,6 +19,7 @@
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
#include "moses/InputFileStream.h"
@@ -26,9 +27,11 @@
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include "moses/TargetPhraseCollection.h"
+#include "util/usage.hh"
#include <map>
#include "moses/TranslationModel/PhraseDictionary.h"
+#include "sapt_phrase_scorers.h"
// TO DO:
// - make lexical phrase scorer take addition to the "dynamic overlay" into account
@@ -46,42 +49,85 @@ namespace Moses
#endif
{
friend class Alignment;
+ map<string,string> param;
+ vector<float> bias;
+ string m_name;
public:
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
typedef imBitext<Token> imbitext;
+ typedef Bitext<Token> bitext;
typedef TSA<Token> tsa;
+ typedef PhraseScorer<Token> pscorer;
private:
+ // vector<sptr<bitext> > shards;
mmbitext btfix;
- sptr<imbitext> btdyn;
- string bname,extra_data;
+ sptr<imbitext> btdyn;
+ string bname,extra_data,bias_file;
string L1;
string L2;
- float m_lbop_parameter;
+ float m_lbop_conf; // confidence level for lbop smoothing
+ float m_lex_alpha; // alpha paramter (j+a)/(m+a) for lexical smoothing
+ // alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha)
+ // must be > 0 if dynamic
size_t m_default_sample_size;
size_t m_workers; // number of worker threads for sampling the bitexts
- char m_pfwd_denom; // denominator for computation of fwd phrase score:
- // 'r' - divide by raw count
- // 's' - divide by sample count
- // 'g' - devide by number of "good" (i.e. coherent) samples
- // size_t num_features;
+ vector<string> m_feature_set_names; // one or more of: standard, datasource
+
+ // // deprecated!
+ // char m_pfwd_denom; // denominator for computation of fwd phrase score:
+ // // 'r' - divide by raw count
+ // // 's' - divide by sample count
+ // // 'g' - devide by number of "good" (i.e. coherent) samples
+ // // size_t num_features;
+
size_t input_factor;
size_t output_factor; // we can actually return entire Tokens!
+
+ // bool withLogCountFeatures; // add logs of counts as features?
+ // bool withCoherence;
+ // string m_pfwd_features; // which pfwd functions to use
+ // string m_pbwd_features; // which pbwd functions to use
+
+ // for display for human inspection (ttable dumps):
+ vector<string> m_feature_names; // names of features activated
+ vector<bool> m_is_logval; // keeps track of which features are log valued
+ vector<bool> m_is_integer; // keeps track of which features are integer valued
+
+ vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
+ vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
+ vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (dyn)
+
+ void
+ register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry);
+
+ template<typename fftype>
+ void
+ check_ff(string const ffname,vector<sptr<pscorer> >* registry = NULL);
+ // add feature function if specified
+
+ template<typename fftype>
+ void
+ check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry = NULL);
+ // add feature function if specified
+
+ void
+ add_corpus_specific_features(vector<sptr<pscorer > >& ffvec);
+
// built-in feature functions
- PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
- PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
- PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually
- PScorePP<Token> apply_pp; // apply phrase penalty
- PScoreLogCounts<Token> add_logcounts_fix;
- PScoreLogCounts<Token> add_logcounts_dyn;
+ // PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
+ // PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
+ // PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually
+ // PScorePC<Token> apply_pp; // apply phrase penalty
+ // PScoreLogCounts<Token> add_logcounts_fix;
+ // PScoreLogCounts<Token> add_logcounts_dyn;
void init(string const& line);
mutable boost::mutex lock;
+ bool withPbwd;
bool poolCounts;
- bool withLogCountFeatures; // add logs of counts as features?
- bool withPfwd,withPbwd;
vector<FactorType> ofactor;
-
+
public:
// typedef boost::unordered_map<uint64_t, sptr<TargetPhraseCollection> > tpcoll_cache_t;
class TargetPhraseCollectionWrapper
@@ -89,11 +135,15 @@ namespace Moses
{
public:
size_t const revision; // time stamp from dynamic bitext
- uint64_t const key; // phrase key
+ ::uint64_t const key; // phrase key
uint32_t refCount; // reference count
+#if defined(timespec)
timespec tstamp; // last use
+#else
+ timeval tstamp; // last use
+#endif
int idx; // position in history heap
- TargetPhraseCollectionWrapper(size_t r, uint64_t const k);
+ TargetPhraseCollectionWrapper(size_t r, ::uint64_t const k);
~TargetPhraseCollectionWrapper();
};
@@ -107,7 +157,7 @@ namespace Moses
void
decache(TargetPhraseCollectionWrapper* ptr) const;
- typedef map<uint64_t, TargetPhraseCollectionWrapper*> tpc_cache_t;
+ typedef map<typename ::uint64_t, TargetPhraseCollectionWrapper*> tpc_cache_t;
mutable tpc_cache_t m_cache;
mutable vector<TargetPhraseCollectionWrapper*> m_history;
// phrase table feature weights for alignment:
@@ -119,16 +169,28 @@ namespace Moses
mm2dtable_t COOCraw;
TargetPhrase*
- createTargetPhrase
+ mkTPhrase(Phrase const& src,
+ Moses::bitext::PhrasePair<Token>* fix,
+ Moses::bitext::PhrasePair<Token>* dyn,
+ sptr<Bitext<Token> > const& dynbt) const;
+
+ // template<typename Token>
+ // void
+ // expand(typename Bitext<Token>::iter const& m, Bitext<Token> const& bt,
+ // pstats const& pstats, vector<PhrasePair<Token> >& dest);
+
+#if 0
+ TargetPhrase*
+ mkTPhrase
(Phrase const& src,
Bitext<Token> const& bt,
- bitext::PhrasePair const& pp
+ Moses::bitext::PhrasePair const& pp
) const;
-
+#endif
void
process_pstats
(Phrase const& src,
- uint64_t const pid1,
+ ::uint64_t const pid1,
pstats const& stats,
Bitext<Token> const & bt,
TargetPhraseCollection* tpcoll
@@ -137,10 +199,10 @@ namespace Moses
bool
pool_pstats
(Phrase const& src,
- uint64_t const pid1a,
+ ::uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
- uint64_t const pid1b,
+ ::uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll
@@ -149,25 +211,38 @@ namespace Moses
bool
combine_pstats
(Phrase const& src,
- uint64_t const pid1a,
+ ::uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
- uint64_t const pid1b,
+ ::uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll
) const;
void
- load_extra_data(string bname);
+ load_extra_data(string bname, bool locking);
+
+ void
+ load_bias(string bname);
mutable size_t m_tpc_ctr;
public:
// Mmsapt(string const& description, string const& line);
Mmsapt(string const& line);
+
void
Load();
+
+ void
+ Load(bool with_checks);
+ // returns the prior table limit
+ size_t SetTableLimit(size_t limit);
+
+ string const&
+ GetName() const;
+
#ifndef NO_MOSES
TargetPhraseCollection const*
GetTargetPhraseCollectionLEGACY(const Phrase& src) const;
@@ -202,8 +277,23 @@ namespace Moses
/// return true if prefix /phrase/ exists
bool
+ PrefixExists(Phrase const& phrase, vector<float> const* const bias) const;
+
+ bool
PrefixExists(Phrase const& phrase) const;
+ vector<string> const&
+ GetFeatureNames() const;
+
+ // void
+ // ScorePPfix(bitext::PhrasePair& pp) const;
+
+ bool
+ isLogVal(int i) const;
+
+ bool
+ isInteger(int i) const;
+
private:
};
} // end namespace
diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc
index 4dd6081b0..8b6bf1eb2 100644
--- a/moses/TranslationModel/UG/mmsapt_align.cc
+++ b/moses/TranslationModel/UG/mmsapt_align.cc
@@ -1,334 +1,336 @@
#include "mmsapt.h"
+// currently broken
-namespace Moses
-{
- using namespace bitext;
- using namespace std;
- using namespace boost;
+// namespace Moses
+// {
+// using namespace bitext;
+// using namespace std;
+// using namespace boost;
- struct PPgreater
- {
- bool operator()(PhrasePair const& a, PhrasePair const& b)
- {
- return a.score > b.score;
- }
- };
+// struct PPgreater
+// {
+// bool operator()(PhrasePair const& a, PhrasePair const& b)
+// {
+// return a.score > b.score;
+// }
+// };
- void
- Mmsapt::
- setWeights(vector<float> const & w)
- {
- assert(w.size() == this->m_numScoreComponents);
- this->feature_weights = w;
- }
+// void
+// Mmsapt::
+// setWeights(vector<float> const & w)
+// {
+// assert(w.size() == this->m_numScoreComponents);
+// this->feature_weights = w;
+// }
- struct PhraseAlnHyp
- {
- PhrasePair pp;
- ushort s1,e1,s2,e2; // start and end positions
- int prev; // preceding alignment hypothesis
- float score;
- bitvector scov; // source coverage
- PhraseAlnHyp(PhrasePair const& ppx, int slen,
- pair<uint32_t,uint32_t> const& sspan,
- pair<uint32_t,uint32_t> const& tspan)
- : pp(ppx), prev(-1), score(ppx.score), scov(slen)
- {
- s1 = sspan.first; e1 = sspan.second;
- s2 = tspan.first; e2 = tspan.second;
- for (size_t i = s1; i < e1; ++i)
- scov.set(i);
- }
+// struct PhraseAlnHyp
+// {
+// PhrasePair pp;
+// ushort s1,e1,s2,e2; // start and end positions
+// int prev; // preceding alignment hypothesis
+// float score;
+// bitvector scov; // source coverage
+// PhraseAlnHyp(PhrasePair const& ppx, int slen,
+// pair<uint32_t,uint32_t> const& sspan,
+// pair<uint32_t,uint32_t> const& tspan)
+// : pp(ppx), prev(-1), score(ppx.score), scov(slen)
+// {
+// s1 = sspan.first; e1 = sspan.second;
+// s2 = tspan.first; e2 = tspan.second;
+// for (size_t i = s1; i < e1; ++i)
+// scov.set(i);
+// }
- bool operator<(PhraseAlnHyp const& other) const
- {
- return this->score < other.score;
- }
+// bool operator<(PhraseAlnHyp const& other) const
+// {
+// return this->score < other.score;
+// }
- bool operator>(PhraseAlnHyp const& other) const
- {
- return this->score > other.score;
- }
+// bool operator>(PhraseAlnHyp const& other) const
+// {
+// return this->score > other.score;
+// }
- PhraseOrientation
- po_bwd(PhraseAlnHyp const* prev) const
- {
- if (s2 == 0) return po_first;
- assert(prev);
- assert(prev->e2 <= s2);
- if (prev->e2 < s2) return po_other;
- if (prev->e1 == s1) return po_mono;
- if (prev->e1 < s1) return po_jfwd;
- if (prev->s1 == e1) return po_swap;
- if (prev->s1 > e1) return po_jbwd;
- return po_other;
- }
+// PhraseOrientation
+// po_bwd(PhraseAlnHyp const* prev) const
+// {
+// if (s2 == 0) return po_first;
+// assert(prev);
+// assert(prev->e2 <= s2);
+// if (prev->e2 < s2) return po_other;
+// if (prev->e1 == s1) return po_mono;
+// if (prev->e1 < s1) return po_jfwd;
+// if (prev->s1 == e1) return po_swap;
+// if (prev->s1 > e1) return po_jbwd;
+// return po_other;
+// }
- PhraseOrientation
- po_fwd(PhraseAlnHyp const* next) const
- {
- if (!next) return po_last;
- assert(next->s2 >= e2);
- if (next->s2 < e2) return po_other;
- if (next->e1 == s1) return po_swap;
- if (next->e1 < s1) return po_jbwd;
- if (next->s1 == e1) return po_mono;
- if (next->s1 > e1) return po_jfwd;
- return po_other;
- }
+// PhraseOrientation
+// po_fwd(PhraseAlnHyp const* next) const
+// {
+// if (!next) return po_last;
+// assert(next->s2 >= e2);
+// if (next->s2 < e2) return po_other;
+// if (next->e1 == s1) return po_swap;
+// if (next->e1 < s1) return po_jbwd;
+// if (next->s1 == e1) return po_mono;
+// if (next->s1 > e1) return po_jfwd;
+// return po_other;
+// }
- float
- dprob_fwd(PhraseAlnHyp const& next)
- {
- return pp.dfwd[po_fwd(&next)];
- }
+// float
+// dprob_fwd(PhraseAlnHyp const& next)
+// {
+// return pp.dfwd[po_fwd(&next)];
+// }
- float
- dprob_bwd(PhraseAlnHyp const& prev)
- {
- return pp.dbwd[po_bwd(&prev)];
- }
+// float
+// dprob_bwd(PhraseAlnHyp const& prev)
+// {
+// return pp.dbwd[po_bwd(&prev)];
+// }
- };
+// };
- class Alignment
- {
- typedef L2R_Token<SimpleWordId> Token;
- typedef TSA<Token> tsa;
- typedef pair<uint32_t, uint32_t> span;
- typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
- typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
- typedef pstats::trg_map_t jStatsTable;
+// class Alignment
+// {
+// typedef L2R_Token<SimpleWordId> Token;
+// typedef TSA<Token> tsa;
+// typedef pair<uint32_t, uint32_t> span;
+// typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
+// typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
+// typedef pstats::trg_map_t jStatsTable;
- Mmsapt const& PT;
- vector<id_type> s,t;
- pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
- pid2span_t spid2span,tpid2span;
- vector<vector<sptr<pstats> > > spstats;
+// Mmsapt const& PT;
+// vector<id_type> s,t;
+// pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
+// pid2span_t spid2span,tpid2span;
+// vector<vector<sptr<pstats> > > spstats;
- vector<PhrasePair> PP;
- // position-independent phrase pair info
- public:
- vector<PhraseAlnHyp> PAH;
- vector<vector<int> > tpos2ahyp;
- // maps from target start positions to PhraseAlnHyps starting at
- // that position
+// vector<PhrasePair> PP;
+// // position-independent phrase pair info
+// public:
+// vector<PhraseAlnHyp> PAH;
+// vector<vector<int> > tpos2ahyp;
+// // maps from target start positions to PhraseAlnHyps starting at
+// // that position
- sptr<pstats> getPstats(span const& sspan);
- void fill_tspan_maps();
- void fill_sspan_maps();
- public:
- Alignment(Mmsapt const& pt, string const& src, string const& trg);
- void show(ostream& out);
- void show(ostream& out, PhraseAlnHyp const& ah);
- };
+// sptr<pstats> getPstats(span const& sspan);
+// void fill_tspan_maps();
+// void fill_sspan_maps();
+// public:
+// Alignment(Mmsapt const& pt, string const& src, string const& trg);
+// void show(ostream& out);
+// void show(ostream& out, PhraseAlnHyp const& ah);
+// };
- void
- Alignment::
- show(ostream& out, PhraseAlnHyp const& ah)
- {
- LexicalPhraseScorer2<Token>::table_t const&
- COOCjnt = PT.calc_lex.scorer.COOC;
+// void
+// Alignment::
+// show(ostream& out, PhraseAlnHyp const& ah)
+// {
+// #if 0
+// LexicalPhraseScorer2<Token>::table_t const&
+// COOCjnt = PT.calc_lex.scorer.COOC;
- out << setw(10) << exp(ah.score) << " "
- << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
- << " <=> "
- << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
- vector<uchar> const& a = ah.pp.aln;
- // BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
- for (size_t u = 0; u+1 < a.size(); u += 2)
- out << " " << int(a[u+1]) << "-" << int(a[u]);
+// out << setw(10) << exp(ah.score) << " "
+// << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
+// << " <=> "
+// << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
+// vector<uchar> const& a = ah.pp.aln;
+// // BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
+// for (size_t u = 0; u+1 < a.size(); u += 2)
+// out << " " << int(a[u+1]) << "-" << int(a[u]);
- if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
- out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
- << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
- << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
- out << endl;
- // float const* ofwdj = ah.pp.dfwd;
- // float const* obwdj = ah.pp.dbwd;
- // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
- // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
- // out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
- // << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
- // << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
- // << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
- // << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
- // << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
- // << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
- // << "]" << endl
- // << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
- // << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
- // << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
- // << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
- // << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
- // << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
- // << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
- // << "]" << endl;
- }
+// if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
+// out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
+// << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
+// << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
+// out << endl;
+// // float const* ofwdj = ah.pp.dfwd;
+// // float const* obwdj = ah.pp.dbwd;
+// // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
+// // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
+// // out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
+// // << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
+// // << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
+// // << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
+// // << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
+// // << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
+// // << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
+// // << "]" << endl
+// // << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
+// // << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
+// // << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
+// // << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
+// // << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
+// // << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
+// // << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
+// // << "]" << endl;
+// #endif
+// }
- void
- Alignment::
- show(ostream& out)
- {
- // show what we have so far ...
- for (size_t s2 = 0; s2 < t.size(); ++s2)
- {
- VectorIndexSorter<PhraseAlnHyp> foo(PAH);
- sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
- for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
- show(out,PAH[tpos2ahyp[s2][h]]);
- }
- }
+// void
+// Alignment::
+// show(ostream& out)
+// {
+// // show what we have so far ...
+// for (size_t s2 = 0; s2 < t.size(); ++s2)
+// {
+// VectorIndexSorter<PhraseAlnHyp> foo(PAH);
+// sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
+// for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
+// show(out,PAH[tpos2ahyp[s2][h]]);
+// }
+// }
- sptr<pstats>
- Alignment::
- getPstats(span const& sspan)
- {
- size_t k = sspan.second - sspan.first - 1;
- if (k < spstats[sspan.first].size())
- return spstats[sspan.first][k];
- else return sptr<pstats>();
- }
+// sptr<pstats>
+// Alignment::
+// getPstats(span const& sspan)
+// {
+// size_t k = sspan.second - sspan.first - 1;
+// if (k < spstats[sspan.first].size())
+// return spstats[sspan.first][k];
+// else return sptr<pstats>();
+// }
- void
- Alignment::
- fill_tspan_maps()
- {
- tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
- for (size_t i = 0; i < t.size(); ++i)
- {
- tsa::tree_iterator m(PT.btfix.I2.get());
- for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
- {
- uint64_t pid = m.getPid();
- tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
- tspan2pid[i][k] = pid;
- }
- }
- }
+// void
+// Alignment::
+// fill_tspan_maps()
+// {
+// tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
+// for (size_t i = 0; i < t.size(); ++i)
+// {
+// tsa::tree_iterator m(PT.btfix.I2.get());
+// for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
+// {
+// uint64_t pid = m.getPid();
+// tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
+// tspan2pid[i][k] = pid;
+// }
+// }
+// }
- void
- Alignment::
- fill_sspan_maps()
- {
- sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
- spstats.resize(s.size());
- for (size_t i = 0; i < s.size(); ++i)
- {
- tsa::tree_iterator m(PT.btfix.I1.get());
- for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
- {
- uint64_t pid = m.getPid();
- sspan2pid[i][k] = pid;
- pid2span_t::iterator p = spid2span.find(pid);
- if (p != spid2span.end())
- {
- int x = p->second[0].first;
- int y = p->second[0].second-1;
- spstats[i].push_back(spstats[x][y-x]);
- }
- else
- {
- spstats[i].push_back(PT.btfix.lookup(m));
- cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
- << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
- << endl;
- }
- spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
- }
- }
- }
+// void
+// Alignment::
+// fill_sspan_maps()
+// {
+// sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
+// spstats.resize(s.size());
+// for (size_t i = 0; i < s.size(); ++i)
+// {
+// tsa::tree_iterator m(PT.btfix.I1.get());
+// for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
+// {
+// uint64_t pid = m.getPid();
+// sspan2pid[i][k] = pid;
+// pid2span_t::iterator p = spid2span.find(pid);
+// if (p != spid2span.end())
+// {
+// int x = p->second[0].first;
+// int y = p->second[0].second-1;
+// spstats[i].push_back(spstats[x][y-x]);
+// }
+// else
+// {
+// spstats[i].push_back(PT.btfix.lookup(m));
+// cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
+// << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
+// << endl;
+// }
+// spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
+// }
+// }
+// }
- Alignment::
- Alignment(Mmsapt const& pt, string const& src, string const& trg)
- : PT(pt)
- {
- PT.btfix.V1->fillIdSeq(src,s);
- PT.btfix.V2->fillIdSeq(trg,t);
+// Alignment::
+// Alignment(Mmsapt const& pt, string const& src, string const& trg)
+// : PT(pt)
+// {
+// PT.btfix.V1->fillIdSeq(src,s);
+// PT.btfix.V2->fillIdSeq(trg,t);
- // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
- // BOOST_FOREACH(id_type i, t)
- // {
- // cout << (*PT.btfix.V2)[i];
- // if (i < PT.wlex21.size())
- // {
- // BOOST_FOREACH(id_type k, PT.wlex21[i])
- // {
- // size_t j = COOC[k][i];
- // size_t m1 = COOC.m1(k);
- // size_t m2 = COOC.m2(i);
- // if (j*1000 > m1 && j*1000 > m2)
- // cout << " " << (*PT.btfix.V1)[k];
- // }
- // }
- // cout << endl;
- // }
+// // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
+// // BOOST_FOREACH(id_type i, t)
+// // {
+// // cout << (*PT.btfix.V2)[i];
+// // if (i < PT.wlex21.size())
+// // {
+// // BOOST_FOREACH(id_type k, PT.wlex21[i])
+// // {
+// // size_t j = COOC[k][i];
+// // size_t m1 = COOC.m1(k);
+// // size_t m2 = COOC.m2(i);
+// // if (j*1000 > m1 && j*1000 > m2)
+// // cout << " " << (*PT.btfix.V1)[k];
+// // }
+// // }
+// // cout << endl;
+// // }
- fill_tspan_maps();
- fill_sspan_maps();
- tpos2ahyp.resize(t.size());
- // now fill the association score table
- PAH.reserve(1000000);
- typedef pid2span_t::iterator psiter;
- for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
- {
- if (!L->second.size()) continue; // should never happen anyway
- int i = L->second[0].first;
- int k = L->second[0].second - i -1;
- sptr<pstats> ps = spstats[i][k];
- PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
- jStatsTable & J = ps->trg;
- for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
- {
- psiter R = tpid2span.find(y->first);
- if (R == tpid2span.end()) continue;
- pp.update(y->first, y->second);
- PT.calc_lex(PT.btfix,pp);
- PT.calc_pfwd_fix(PT.btfix,pp);
- PT.calc_pbwd_fix(PT.btfix,pp);
- pp.eval(PT.feature_weights);
- PP.push_back(pp);
- BOOST_FOREACH(span const& sspan, L->second)
- {
- BOOST_FOREACH(span const& tspan, R->second)
- {
- tpos2ahyp[tspan.first].push_back(PAH.size());
- PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
- }
- }
- }
- }
- }
+// fill_tspan_maps();
+// fill_sspan_maps();
+// tpos2ahyp.resize(t.size());
+// // now fill the association score table
+// PAH.reserve(1000000);
+// typedef pid2span_t::iterator psiter;
+// for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
+// {
+// if (!L->second.size()) continue; // should never happen anyway
+// int i = L->second[0].first;
+// int k = L->second[0].second - i -1;
+// sptr<pstats> ps = spstats[i][k];
+// PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
+// jStatsTable & J = ps->trg;
+// for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
+// {
+// psiter R = tpid2span.find(y->first);
+// if (R == tpid2span.end()) continue;
+// pp.update(y->first, y->second);
+// PT.ScorePPfix(pp);
+// pp.eval(PT.feature_weights);
+// PP.push_back(pp);
+// BOOST_FOREACH(span const& sspan, L->second)
+// {
+// BOOST_FOREACH(span const& tspan, R->second)
+// {
+// tpos2ahyp[tspan.first].push_back(PAH.size());
+// PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
+// }
+// }
+// }
+// }
+// }
- int
- extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
- {
- if ((PAH[edge].scov & PAH[next].scov).count())
- return -1;
- int ret = PAH.size();
- PAH.push_back(PAH[next]);
- PhraseAlnHyp & h = PAH.back();
- h.prev = edge;
- h.scov |= PAH[edge].scov;
- h.score += log(PAH[edge].dprob_fwd(PAH[next]));
- h.score += log(PAH[next].dprob_bwd(PAH[edge]));
- return ret;
- }
+// int
+// extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
+// {
+// if ((PAH[edge].scov & PAH[next].scov).count())
+// return -1;
+// int ret = PAH.size();
+// PAH.push_back(PAH[next]);
+// PhraseAlnHyp & h = PAH.back();
+// h.prev = edge;
+// h.scov |= PAH[edge].scov;
+// h.score += log(PAH[edge].dprob_fwd(PAH[next]));
+// h.score += log(PAH[next].dprob_bwd(PAH[edge]));
+// return ret;
+// }
+
+// sptr<vector<int> >
+// Mmsapt::
+// align(string const& src, string const& trg) const
+// {
+// // For the time being, we consult only the fixed bitext.
+// // We might also consider the dynamic bitext. => TO DO.
+// Alignment A(*this,src,trg);
+// VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
+// vector<size_t> o; foo.GetOrder(o);
+// BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
+// sptr<vector<int> > aln;
+// return aln;
+// }
+// }
- sptr<vector<int> >
- Mmsapt::
- align(string const& src, string const& trg) const
- {
- // For the time being, we consult only the fixed bitext.
- // We might also consider the dynamic bitext. => TO DO.
- Alignment A(*this,src,trg);
- VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
- vector<size_t> o; foo.GetOrder(o);
- BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
- sptr<vector<int> > aln;
- return aln;
- }
-}
diff --git a/moses/TranslationModel/UG/ptable-describe-features.cc b/moses/TranslationModel/UG/ptable-describe-features.cc
new file mode 100644
index 000000000..dbd5accb9
--- /dev/null
+++ b/moses/TranslationModel/UG/ptable-describe-features.cc
@@ -0,0 +1,37 @@
+#include "mmsapt.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+int main()
+{
+ string line;
+ while(getline(cin,line))
+ {
+ if (line.empty()) continue;
+ size_t k = line.find_first_not_of(" ");
+ if (line.find("Mmsapt") != k &&
+ line.find("PhraseDictionaryBitextSampling") != k)
+ continue;
+ Mmsapt PT(line);
+ PT.Load(false);
+ cout << PT.GetName() << ":" << endl;
+ vector<string> const& fnames = PT.GetFeatureNames();
+ BOOST_FOREACH(string const& s, fnames)
+ cout << s << endl;
+ cout << endl;
+ }
+ exit(0);
+}
+
+
+
diff --git a/moses/TranslationModel/UG/ptable-lookup.cc b/moses/TranslationModel/UG/ptable-lookup.cc
new file mode 100644
index 000000000..2cbf89b16
--- /dev/null
+++ b/moses/TranslationModel/UG/ptable-lookup.cc
@@ -0,0 +1,123 @@
+#include "mmsapt.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+vector<FactorType> fo(1,FactorType(0));
+
+class SimplePhrase : public Moses::Phrase
+{
+ vector<FactorType> const m_fo; // factor order
+public:
+ SimplePhrase(): m_fo(1,FactorType(0)) {}
+
+ void init(string const& s)
+ {
+ istringstream buf(s); string w;
+ while (buf >> w)
+ {
+ Word wrd;
+ this->AddWord().CreateFromString(Input,m_fo,StringPiece(w),false,false);
+ }
+ }
+};
+
+class TargetPhraseIndexSorter
+{
+ TargetPhraseCollection const& my_tpc;
+ CompareTargetPhrase cmp;
+public:
+ TargetPhraseIndexSorter(TargetPhraseCollection const& tpc) : my_tpc(tpc) {}
+ bool operator()(size_t a, size_t b) const
+ {
+ return cmp(*my_tpc[a], *my_tpc[b]);
+ }
+};
+
+int main(int argc, char* argv[])
+{
+ Parameter params;
+ if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(&params, argv[0]))
+ exit(1);
+
+ StaticData const& global = StaticData::Instance();
+ global.SetVerboseLevel(0);
+ vector<FactorType> ifo = global.GetInputFactorOrder();
+
+ PhraseDictionary* PT = PhraseDictionary::GetColl()[0];
+ Mmsapt* mmsapt = dynamic_cast<Mmsapt*>(PT);
+ PhraseDictionaryTreeAdaptor* pdta = dynamic_cast<PhraseDictionaryTreeAdaptor*>(PT);
+ // vector<FeatureFunction*> const& ffs = FeatureFunction::GetFeatureFunctions();
+
+ if (!mmsapt && !pdta)
+ {
+ cerr << "Phrase table implementation not supported by this utility." << endl;
+ exit(1);
+ }
+
+ string line;
+ while (true)
+ {
+ Sentence phrase;
+ if (!phrase.Read(cin,ifo)) break;
+ if (pdta)
+ {
+ pdta->InitializeForInput(phrase);
+ // do we also need to call CleanupAfterSentenceProcessing at the end?
+ }
+ Phrase& p = phrase;
+
+ cout << p << endl;
+ TargetPhraseCollection const* trg = PT->GetTargetPhraseCollectionLEGACY(p);
+ if (!trg) continue;
+ vector<size_t> order(trg->GetSize());
+ for (size_t i = 0; i < order.size(); ++i) order[i] = i;
+ sort(order.begin(),order.end(),TargetPhraseIndexSorter(*trg));
+ size_t k = 0;
+ // size_t precision =
+ cout.precision(2);
+
+ vector<string> fname;
+ if (mmsapt)
+ {
+ fname = mmsapt->GetFeatureNames();
+ cout << " ";
+ BOOST_FOREACH(string const& fn, fname)
+ cout << " " << format("%10.10s") % fn;
+ cout << endl;
+ }
+
+ BOOST_FOREACH(size_t i, order)
+ {
+ Phrase const& phr = static_cast<Phrase const&>(*(*trg)[i]);
+ cout << setw(3) << ++k << " " << phr << endl;
+ ScoreComponentCollection const& scc = (*trg)[i]->GetScoreBreakdown();
+ ScoreComponentCollection::IndexPair idx = scc.GetIndexes(PT);
+ FVector const& scores = scc.GetScoresVector();
+ cout << " ";
+ for (size_t k = idx.first; k < idx.second; ++k)
+ {
+ size_t j = k-idx.first;
+ float f = (mmsapt ? mmsapt->isLogVal(j) ? exp(scores[k]) : scores[k]
+ : scores[k] < 0 ? exp(scores[k]) : scores[k]);
+ string fmt = (mmsapt && mmsapt->isInteger(j)) ? "%10d" : "%10.8f";
+ cout << " " << format(fmt) % f;
+ }
+ cout << endl;
+ }
+ PT->Release(trg);
+ }
+ exit(0);
+}
+
+
+
diff --git a/moses/TranslationModel/UG/sapt_phrase_key.h b/moses/TranslationModel/UG/sapt_phrase_key.h
new file mode 100644
index 000000000..e1ecf1573
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_phrase_key.h
@@ -0,0 +1,13 @@
+//-*- c++ -*-
+#pragma once
+#include <stdint.h>
+
+using namespace std;
+namespace sapt
+{
+ using namespace Moses;
+ using namespace std;
+
+
+
+}
diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h
new file mode 100644
index 000000000..9870ed7f0
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h
@@ -0,0 +1,14 @@
+// -*- c++ -*-
+// Phrase scoring functions for suffix array-based phrase tables
+// written by Ulrich Germann
+#pragma once
+#include "sapt_pscore_unaligned.h" // count # of unaligned words
+#include "sapt_pscore_provenance.h" // reward for joint phrase occ. per corpus
+#include "sapt_pscore_rareness.h" // penalty for rare occurrences (global?)
+#include "sapt_pscore_logcnt.h" // logs of observed counts
+#include "sapt_pscore_lex1.h" // plain vanilla Moses lexical scores
+#include "sapt_pscore_pfwd.h" // fwd phrase prob
+#include "sapt_pscore_pbwd.h" // bwd phrase prob
+#include "sapt_pscore_coherence.h" // coherence feature: good/sample-size
+#include "sapt_pscore_phrasecount.h" // phrase count
+#include "sapt_pscore_wordcount.h" // word count
diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h
new file mode 100644
index 000000000..68a491145
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_base.h
@@ -0,0 +1,103 @@
+// -*- c++ -*-
+// Base classes for suffix array-based phrase scorers
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+
+namespace Moses {
+ namespace bitext
+ {
+
+ // abstract base class that defines the common API for phrase scorers
+ template<typename Token>
+ class
+ PhraseScorer
+ {
+ protected:
+ int m_index;
+ int m_num_feats;
+ string m_tag;
+ vector<string> m_feature_names;
+ public:
+
+ virtual
+ void
+ operator()(Bitext<Token> const& pt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest=NULL)
+ const = 0;
+
+ void
+ setIndex(int const i) { m_index = i; }
+
+ int
+ getIndex() const { return m_index; }
+
+ int
+ fcnt() const { return m_num_feats; }
+
+ vector<string> const &
+ fnames() const { return m_feature_names; }
+
+ string const &
+ fname(int i) const
+ {
+ if (i < 0) i += m_num_feats;
+ UTIL_THROW_IF2(i < 0 || i >= m_num_feats,
+ "Feature name index out of range at " << HERE);
+ return m_feature_names.at(i);
+ }
+
+ virtual
+ bool
+ isLogVal(int i) const { return true; };
+ // is this feature log valued?
+
+ virtual
+ bool
+ isIntegerValued(int i) const { return false; };
+ // is this feature integer valued (e.g., count features)?
+
+ virtual
+ bool
+ allowPooling() const { return true; }
+ // does this feature function allow pooling of counts if
+ // there are no occurrences in the respective corpus?
+
+ };
+
+ // base class for 'families' of phrase scorers that have a single
+ template<typename Token>
+ class
+ SingleRealValuedParameterPhraseScorerFamily
+ : public PhraseScorer<Token>
+ {
+ protected:
+ vector<float> m_x;
+
+ virtual
+ void
+ init(string const specs)
+ {
+ using namespace boost;
+ UTIL_THROW_IF2(this->m_tag.size() == 0,
+ "m_tag must be initialized in constructor");
+ UTIL_THROW_IF2(specs.size() == 0,"empty specification string!");
+ UTIL_THROW_IF2(this->m_feature_names.size(),
+ "PhraseScorer can only be initialized once!");
+ this->m_index = -1;
+ float x; char c;
+ for (istringstream buf(specs); buf>>x; buf>>c)
+ {
+ this->m_x.push_back(x);
+ string fname = (format("%s-%.2f") % this->m_tag % x).str();
+ this->m_feature_names.push_back(fname);
+ }
+ this->m_num_feats = this->m_x.size();
+ }
+ };
+ } // namespace bitext
+} // namespace moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h
new file mode 100644
index 000000000..a3211df54
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h
@@ -0,0 +1,33 @@
+// -*- c++ -*-
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScoreCoherence : public PhraseScorer<Token>
+ {
+ public:
+ PScoreCoherence(string const dummy)
+ {
+ this->m_index = -1;
+ this->m_num_feats = 1;
+ this->m_feature_names.push_back(string("coherence"));
+ }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ (*dest)[this->m_index] = log(pp.good1) - log(pp.sample1);
+ }
+ };
+ }
+}
diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h
new file mode 100644
index 000000000..be994b0d3
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h
@@ -0,0 +1,70 @@
+// -*- c++ -*-
+// Phrase scorer that counts the number of unaligend words in the phrase
+// written by Ulrich Germann
+
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScoreLex1 : public PhraseScorer<Token>
+ {
+ float m_alpha;
+ public:
+ LexicalPhraseScorer2<Token> scorer;
+
+ PScoreLex1(string const& alpaspec, string const& lexfile)
+ {
+ this->m_index = -1;
+ this->m_num_feats = 2;
+ this->m_feature_names.reserve(2);
+ this->m_feature_names.push_back("lexfwd");
+ this->m_feature_names.push_back("lexbwd");
+ m_alpha = atof(alpaspec.c_str());
+ scorer.open(lexfile);
+ }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
+ // parse_pid(pp.p1, sid1, off1, len1);
+ // parse_pid(pp.p2, sid2, off2, len2);
+#if 0
+ cout << len1 << " " << len2 << endl;
+ Token const* t1 = bt.T1->sntStart(sid1);
+ for (size_t i = off1; i < off1 + len1; ++i)
+ cout << (*bt.V1)[t1[i].id()] << " ";
+ cout << __FILE__ << ":" << __LINE__ << endl;
+
+ Token const* t2 = bt.T2->sntStart(sid2);
+ for (size_t i = off2; i < off2 + len2; ++i)
+ cout << (*bt.V2)[t2[i].id()] << " ";
+ cout << __FILE__ << ":" << __LINE__ << endl;
+
+ BOOST_FOREACH (int a, pp.aln)
+ cout << a << " " ;
+ cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
+
+ scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
+ bt.T2->sntStart(sid2)+off2,0,len2,
+ pp.aln, m_alpha,
+ (*dest)[this->m_index],
+ (*dest)[this->m_index+1]);
+#endif
+ scorer.score(pp.start1,0, pp.len1,
+ pp.start2,0, pp.len2, pp.aln, m_alpha,
+ (*dest)[this->m_index],
+ (*dest)[this->m_index+1]);
+ }
+ };
+ } //namespace bitext
+} // namespace Moses
+
diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
new file mode 100644
index 000000000..2790323ed
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
@@ -0,0 +1,65 @@
+// -*- c++ -*-
+// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
+// with the asymptotic function x/(j+x) where x > 0 is a function
+// parameter that determines the steepness of the rewards curve
+// written by Ulrich Germann
+
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+ namespace bitext {
+
+ template<typename Token>
+ class
+ PScoreLogCnt : public PhraseScorer<Token>
+ {
+ string m_specs;
+ public:
+ PScoreLogCnt(string const specs)
+ {
+ this->m_index = -1;
+ this->m_specs = specs;
+ if (specs.find("r1") != string::npos) // raw source phrase counts
+ this->m_feature_names.push_back("log-r1");
+ if (specs.find("s1") != string::npos)
+ this->m_feature_names.push_back("log-s1"); // L1 sample size
+ if (specs.find("g1") != string::npos) // coherent phrases
+ this->m_feature_names.push_back("log-g1");
+ if (specs.find("j") != string::npos) // joint counts
+ this->m_feature_names.push_back("log-j");
+ if (specs.find("r2") != string::npos) // raw target phrase counts
+ this->m_feature_names.push_back("log-r2");
+ this->m_num_feats = this->m_feature_names.size();
+ }
+
+ bool
+ isIntegerValued(int i) const { return true; }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ assert(pp.raw1);
+ assert(pp.sample1);
+ assert(pp.good1);
+ assert(pp.joint);
+ assert(pp.raw2);
+ size_t i = this->m_index;
+ if (m_specs.find("r1") != string::npos)
+ (*dest)[i++] = log(pp.raw1);
+ if (m_specs.find("s1") != string::npos)
+ (*dest)[i++] = log(pp.sample1);
+ if (m_specs.find("g1") != string::npos)
+ (*dest)[i++] = log(pp.good1);
+ if (m_specs.find("j") != string::npos)
+ (*dest)[i++] = log(pp.joint);
+ if (m_specs.find("r2") != string::npos)
+ (*dest)[++i] = log(pp.raw2);
+ }
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
new file mode 100644
index 000000000..f7b4686d7
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
@@ -0,0 +1,58 @@
+//-*- c++ -*-
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "boost/foreach.hpp"
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScorePbwd : public PhraseScorer<Token>
+ {
+ float conf;
+ string denom;
+
+ public:
+ PScorePbwd(float const c, string d)
+ {
+ this->m_index = -1;
+ conf = c;
+ denom = d;
+ size_t checksum = d.size();
+ BOOST_FOREACH(char const& x, denom)
+ {
+ if (x == '+') { --checksum; continue; }
+ if (x != 'g' && x != 's' && x != 'r') continue;
+ string s = (format("pbwd-%c%.3f") % x % c).str();
+ this->m_feature_names.push_back(s);
+ }
+ this->m_num_feats = this->m_feature_names.size();
+ UTIL_THROW_IF2(this->m_feature_names.size() != checksum,
+ "Unknown parameter in specification '"
+ << d << "' for Pbwd phrase scorer at " << HERE);
+ }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ // we use the denominator specification to scale the raw counts on the
+ // target side; the clean way would be to counter-sample
+ size_t i = this->m_index;
+ BOOST_FOREACH(char const& x, denom)
+ {
+ uint32_t m2 = pp.raw2;
+ if (x == 'g') m2 = round(m2 * float(pp.good1) / pp.raw1);
+ else if (x == 's') m2 = round(m2 * float(pp.sample1) / pp.raw1);
+ (*dest)[i++] = log(lbop(max(m2, pp.joint),pp.joint,conf));
+ }
+ }
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
new file mode 100644
index 000000000..ed48a93d2
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
@@ -0,0 +1,70 @@
+// -*- c++ -*-
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "boost/foreach.hpp"
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScorePfwd : public PhraseScorer<Token>
+ {
+ float conf;
+ string denom;
+
+ public:
+
+ PScorePfwd(float const c, string d)
+ {
+ this->m_index = -1;
+ conf = c;
+ denom = d;
+ size_t checksum = d.size();
+ BOOST_FOREACH(char const& x, denom)
+ {
+ if (x == '+') { --checksum; continue; }
+ if (x != 'g' && x != 's' && x != 'r') continue;
+ string s = (format("pfwd-%c%.3f") % x % c).str();
+ this->m_feature_names.push_back(s);
+ }
+ this->m_num_feats = this->m_feature_names.size();
+ UTIL_THROW_IF2(this->m_feature_names.size() != checksum,
+ "Unknown parameter in specification '"
+ << d << "' for Pfwd phrase scorer at " << HERE);
+ }
+
+ void
+ operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ if (pp.joint > pp.good1)
+ {
+ pp.joint = pp.good1;
+ // cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
+ // cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
+ }
+ size_t i = this->m_index;
+ BOOST_FOREACH(char const& c, this->denom)
+ {
+ switch (c)
+ {
+ case 'g':
+ (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf));
+ break;
+ case 's':
+ (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf));
+ break;
+ case 'r':
+ (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf));
+ }
+ }
+ }
+ };
+ }
+}
+
diff --git a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
new file mode 100644
index 000000000..e0a6eb48b
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
@@ -0,0 +1,34 @@
+// -*- c++ -*-
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "sapt_pscore_base.h"
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScorePC : public PhraseScorer<Token>
+ {
+ public:
+ PScorePC(string const dummy)
+ {
+ this->m_index = -1;
+ this->m_num_feats = 1;
+ this->m_feature_names.push_back(string("phrasecount"));
+ }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ (*dest)[this->m_index] = 1;
+ }
+ };
+ }
+}
diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h
new file mode 100644
index 000000000..c33b98fe7
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h
@@ -0,0 +1,47 @@
+// -*- c++ -*-
+// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
+// with the asymptotic function j/(j+x) where x > 0 is a function
+// parameter that determines the steepness of the rewards curve
+// written by Ulrich Germann
+
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+ namespace bitext {
+
+ // asymptotic provenance feature n/(n+x)
+ template<typename Token>
+ class
+ PScoreProvenance : public SingleRealValuedParameterPhraseScorerFamily<Token>
+ {
+ public:
+
+ PScoreProvenance(string const& spec)
+ {
+ this->m_tag = "prov";
+ this->init(spec);
+ }
+
+ bool
+ isLogVal(int i) const { return false; }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ size_t i = this->m_index;
+ BOOST_FOREACH(float const x, this->m_x)
+ (*dest).at(i++) = pp.joint/(x + pp.joint);
+ }
+
+ bool
+ allowPooling() const
+ { return false; }
+
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h
new file mode 100644
index 000000000..58f204c88
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h
@@ -0,0 +1,41 @@
+// -*- c++ -*-
+// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
+// with the asymptotic function x/(j+x) where x > 0 is a function
+// parameter that determines the steepness of the rewards curve
+// written by Ulrich Germann
+
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+ namespace bitext {
+
+ // rareness penalty: x/(n+x)
+ template<typename Token>
+ class
+ PScoreRareness : public SingleRealValuedParameterPhraseScorerFamily<Token>
+ {
+ public:
+ PScoreRareness(string const spec)
+ {
+ this->m_tag = "rare";
+ this->init(spec);
+ }
+
+ bool
+ isLogVal(int i) const { return false; }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ size_t i = this->m_index;
+ BOOST_FOREACH(float const x, this->m_x)
+ (*dest).at(i++) = x/(x + pp.joint);
+ }
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
new file mode 100644
index 000000000..dafc1e129
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
@@ -0,0 +1,67 @@
+// -*- c++ -*-
+// Phrase scorer that counts the number of unaligend words in the phrase
+// written by Ulrich Germann
+
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScoreUnaligned : public PhraseScorer<Token>
+ {
+ typedef boost::dynamic_bitset<typename ::uint64_t> bitvector;
+ public:
+ PScoreUnaligned(string const spec)
+ {
+ this->m_index = -1;
+ int f = this->m_num_feats = atoi(spec.c_str());
+ UTIL_THROW_IF2(f != 1 && f != 2,"unal parameter must be 1 or 2 at "<<HERE);
+ this->m_feature_names.resize(f);
+ if (f == 1)
+ this->m_feature_names[0] = "unal";
+ else
+ {
+ this->m_feature_names[0] = "unal-s";
+ this->m_feature_names[1] = "unal-t";
+ }
+ }
+
+ bool
+ isLogVal(int i) const { return false; }
+
+ bool
+ isIntegerValued(int i) const { return true; }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
+ // parse_pid(pp.p1, sid1, off1, len1);
+ // parse_pid(pp.p2, sid2, off2, len2);
+ bitvector check1(pp.len1),check2(pp.len2);
+ for (size_t i = 0; i < pp.aln.size(); )
+ {
+ check1.set(pp.aln[i++]);
+ check2.set(pp.aln.at(i++));
+ }
+
+ if (this->m_num_feats == 1)
+ {
+ (*dest)[this->m_index] = pp.len1 - check1.count();
+ (*dest)[this->m_index] += pp.len2 - check2.count();
+ }
+ else
+ {
+ (*dest)[this->m_index] = pp.len1 - check1.count();
+ (*dest)[this->m_index+1] = pp.len2 - check2.count();
+ }
+ }
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_wordcount.h b/moses/TranslationModel/UG/sapt_pscore_wordcount.h
new file mode 100644
index 000000000..3227bb6ba
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_wordcount.h
@@ -0,0 +1,34 @@
+// -*- c++ -*-
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "sapt_pscore_base.h"
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScoreWC : public PhraseScorer<Token>
+ {
+ public:
+ PScoreWC(string const dummy)
+ {
+ this->m_index = -1;
+ this->m_num_feats = 1;
+ this->m_feature_names.push_back(string("wordcount"));
+ }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ (*dest)[this->m_index] = pp.len2;
+ }
+ };
+ }
+}
diff --git a/moses/TranslationModel/UG/sim-pe.cc b/moses/TranslationModel/UG/sim-pe.cc
new file mode 100644
index 000000000..460d66c1f
--- /dev/null
+++ b/moses/TranslationModel/UG/sim-pe.cc
@@ -0,0 +1,84 @@
+#include "mmsapt.h"
+#include "moses/Manager.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+vector<FactorType> fo(1,FactorType(0));
+
+ostream&
+operator<<(ostream& out, Hypothesis const* x)
+{
+ vector<const Hypothesis*> H;
+ for (const Hypothesis* h = x; h; h = h->GetPrevHypo())
+ H.push_back(h);
+ for (; H.size(); H.pop_back())
+ {
+ Phrase const& p = H.back()->GetCurrTargetPhrase();
+ for (size_t pos = 0 ; pos < p.GetSize() ; pos++)
+ out << *p.GetFactor(pos, 0) << (H.size() ? " " : "");
+ }
+ return out;
+}
+
+vector<FactorType> ifo;
+size_t lineNumber;
+
+string
+translate(string const& source)
+{
+ StaticData const& global = StaticData::Instance();
+
+ Sentence sentence;
+ istringstream ibuf(source+"\n");
+ sentence.Read(ibuf,ifo);
+
+ // Manager manager(lineNumber, sentence, global.GetSearchAlgorithm());
+ Manager manager(sentence, global.GetSearchAlgorithm());
+ manager.ProcessSentence();
+
+ ostringstream obuf;
+ const Hypothesis* h = manager.GetBestHypothesis();
+ obuf << h;
+ return obuf.str();
+
+}
+
+int main(int argc, char* argv[])
+{
+ Parameter params;
+ if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(&params, argv[0]))
+ exit(1);
+
+ StaticData const& global = StaticData::Instance();
+ global.SetVerboseLevel(0);
+ ifo = global.GetInputFactorOrder();
+
+ lineNumber = 0; // TODO: Include sentence request number here?
+ string source, target, alignment;
+ while (getline(cin,source))
+ {
+ getline(cin,target);
+ getline(cin,alignment);
+ cout << "[S] " << source << endl;
+ cout << "[H] " << translate(source) << endl;
+ cout << "[T] " << target << endl;
+ Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
+ pdsa->add(source,target,alignment);
+ cout << "[X] " << translate(source) << endl;
+ cout << endl;
+ }
+ exit(0);
+}
+
+
+
diff --git a/moses/TranslationModel/UG/spe-check-coverage.cc b/moses/TranslationModel/UG/spe-check-coverage.cc
new file mode 100644
index 000000000..039b4cd37
--- /dev/null
+++ b/moses/TranslationModel/UG/spe-check-coverage.cc
@@ -0,0 +1,214 @@
+#include "mmsapt.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+vector<FactorType> fo(1,FactorType(0));
+
+class SimplePhrase : public Moses::Phrase
+{
+ vector<FactorType> const m_fo; // factor order
+public:
+ SimplePhrase(): m_fo(1,FactorType(0)) {}
+
+ void init(string const& s)
+ {
+ istringstream buf(s); string w;
+ while (buf >> w)
+ {
+ Word wrd;
+ this->AddWord().CreateFromString(Input,m_fo,StringPiece(w),false,false);
+ }
+ }
+};
+
+class TargetPhraseIndexSorter
+{
+ TargetPhraseCollection const& my_tpc;
+ CompareTargetPhrase cmp;
+public:
+ TargetPhraseIndexSorter(TargetPhraseCollection const& tpc) : my_tpc(tpc) {}
+ bool operator()(size_t a, size_t b) const
+ {
+ // return cmp(*my_tpc[a], *my_tpc[b]);
+ return (my_tpc[a]->GetScoreBreakdown().GetWeightedScore() >
+ my_tpc[b]->GetScoreBreakdown().GetWeightedScore());
+ }
+};
+
+int main(int argc, char* argv[])
+{
+
+ string vlevel = "alt"; // verbosity level
+ vector<pair<string,int> > argfilter(5);
+ argfilter[0] = std::make_pair(string("--spe-src"),1);
+ argfilter[1] = std::make_pair(string("--spe-trg"),1);
+ argfilter[2] = std::make_pair(string("--spe-aln"),1);
+ argfilter[3] = std::make_pair(string("--spe-show"),1);
+
+ char** my_args; int my_acnt;
+ char** mo_args; int mo_acnt;
+ filter_arguments(argc, argv, mo_acnt, &mo_args, my_acnt, &my_args, argfilter);
+
+ ifstream spe_src,spe_trg,spe_aln;
+ // instead of translating show coverage by phrase tables
+ for (int i = 0; i < my_acnt; i += 2)
+ {
+ if (!strcmp(my_args[i],"--spe-src"))
+ spe_src.open(my_args[i+1]);
+ else if (!strcmp(my_args[i],"--spe-trg"))
+ spe_trg.open(my_args[i+1]);
+ else if (!strcmp(my_args[i],"--spe-aln"))
+ spe_aln.open(my_args[i+1]);
+ else if (!strcmp(my_args[i],"--spe-show"))
+ vlevel = my_args[i+1];
+ }
+
+ Parameter params;
+ if (!params.LoadParam(mo_acnt,mo_args) ||
+ !StaticData::LoadDataStatic(&params, mo_args[0]))
+ exit(1);
+
+ StaticData const& global = StaticData::Instance();
+ global.SetVerboseLevel(0);
+ vector<FactorType> ifo = global.GetInputFactorOrder();
+
+ PhraseDictionary* PT = PhraseDictionary::GetColl()[0];
+ Mmsapt* mmsapt = dynamic_cast<Mmsapt*>(PT);
+ if (!mmsapt)
+ {
+ cerr << "Phrase table implementation not supported by this utility." << endl;
+ exit(1);
+ }
+ mmsapt->SetTableLimit(0);
+
+ string srcline,trgline,alnline;
+ cout.precision(2);
+ vector<string> fname = mmsapt->GetFeatureNames();
+ while (getline(spe_src,srcline))
+ {
+ UTIL_THROW_IF2(!getline(spe_trg,trgline), HERE
+ << ": missing data for online updates.");
+ UTIL_THROW_IF2(!getline(spe_aln,alnline), HERE
+ << ": missing data for online updates.");
+ cout << string(80,'-') << "\n" << srcline << "\n" << trgline << "\n" << endl;
+
+ // cout << srcline << " " << HERE << endl;
+ Sentence snt;
+ istringstream buf(srcline+"\n");
+ if (!snt.Read(buf,ifo)) break;
+ // cout << Phrase(snt) << endl;
+ int dynprovidx = -1;
+ for (size_t i = 0; i < fname.size(); ++i)
+ {
+ if (fname[i].substr(0,7) == "prov-1.")
+ dynprovidx = i;
+ }
+ cout << endl;
+ for (size_t i = 0; i < snt.GetSize(); ++i)
+ {
+ for (size_t k = i; k < snt.GetSize(); ++k)
+ {
+ Phrase p = snt.GetSubString(WordsRange(i,k));
+ if (!mmsapt->PrefixExists(p)) break;
+ TargetPhraseCollection const* trg = PT->GetTargetPhraseCollectionLEGACY(p);
+ if (!trg || !trg->GetSize()) continue;
+
+ bool header_done = false;
+ bool has_dynamic_match = vlevel == "all" || vlevel == "ALL";
+ vector<size_t> order; order.reserve(trg->GetSize());
+ size_t stop = trg->GetSize();
+
+ vector<size_t> o2(trg->GetSize());
+ for (size_t i = 0; i < stop; ++i) o2[i] = i;
+ sort(o2.begin(),o2.end(),TargetPhraseIndexSorter(*trg));
+
+ for (size_t r = 0; r < stop; ++r) // r for rank
+ {
+ if (vlevel != "ALL")
+ {
+ Phrase const& phr = static_cast<Phrase const&>(*(*trg)[o2[r]]);
+ ostringstream buf; buf << phr;
+ string tphrase = buf.str();
+ tphrase.erase(tphrase.size()-1);
+ size_t s = trgline.find(tphrase);
+ if (s == string::npos) continue;
+ size_t e = s + tphrase.size();
+ if ((s && trgline[s-1] != ' ') || (e < trgline.size() && trgline[e] != ' '))
+ continue;
+ }
+ order.push_back(r);
+ if (!has_dynamic_match)
+ {
+ ScoreComponentCollection const& scc = (*trg)[o2[r]]->GetScoreBreakdown();
+ ScoreComponentCollection::IndexPair idx = scc.GetIndexes(PT);
+ FVector const& scores = scc.GetScoresVector();
+ has_dynamic_match = scores[idx.first + dynprovidx] > 0;
+ }
+ }
+ if ((vlevel == "alt" || vlevel == "new") && !has_dynamic_match)
+ continue;
+
+
+ BOOST_FOREACH(size_t const& r, order)
+ {
+ ScoreComponentCollection const& scc = (*trg)[o2[r]]->GetScoreBreakdown();
+ ScoreComponentCollection::IndexPair idx = scc.GetIndexes(PT);
+ FVector const& scores = scc.GetScoresVector();
+ float wscore = scc.GetWeightedScore();
+ if (vlevel == "new" && scores[idx.first + dynprovidx] == 0)
+ continue;
+ if (!header_done)
+ {
+ cout << endl;
+ if (trg->GetSize() == 1)
+ cout << p << " (1 translation option)" << endl;
+ else
+ cout << p << " (" << trg->GetSize() << " translation options)" << endl;
+ header_done = true;
+ }
+ Phrase const& phr = static_cast<Phrase const&>(*(*trg)[o2[r]]);
+ cout << setw(3) << r+1 << " " << phr << endl;
+ cout << " ";
+ BOOST_FOREACH(string const& fn, fname)
+ cout << " " << format("%10.10s") % fn;
+ cout << endl;
+ cout << " ";
+ for (size_t x = idx.first; x < idx.second; ++x)
+ {
+ size_t j = x-idx.first;
+ float f = (mmsapt && mmsapt->isLogVal(j)) ? exp(scores[x]) : scores[x];
+ string fmt = (mmsapt && mmsapt->isInteger(j)) ? "%10d" : "%10.8f";
+ if (fname[j].substr(0,3) == "lex") fmt = "%10.3e";
+ if (fname[j].substr(0,7) == "prov-1.")
+ {
+ f = round(f/(1-f));
+ fmt = "%10d";
+ }
+ cout << " " << format(fmt) % (mmsapt->isInteger(j) ? round(f) : f);
+ }
+ cout << " " << format("%10.3e") % exp(wscore)
+ << " " << format("%10.3e") % exp((*trg)[o2[r]]->GetFutureScore()) << endl;
+ }
+ mmsapt->Release(trg);
+ continue;
+ }
+ }
+ mmsapt->add(srcline,trgline,alnline);
+ }
+ // }
+ exit(0);
+}
+
+
+
diff --git a/moses/TranslationModel/UG/spe-check-coverage2.cc b/moses/TranslationModel/UG/spe-check-coverage2.cc
new file mode 100644
index 000000000..fa9ce1c85
--- /dev/null
+++ b/moses/TranslationModel/UG/spe-check-coverage2.cc
@@ -0,0 +1,76 @@
+#include "mmsapt.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+typedef L2R_Token<SimpleWordId> Token;
+typedef mmBitext<Token> mmbitext;
+typedef imBitext<Token> imbitext;
+typedef Bitext<Token>::iter iter;
+
+mmbitext bg;
+
+void
+show(ostream& out, iter& f)
+{
+ iter b(bg.I2.get(),f.getToken(0),f.size());
+ if (b.size() == f.size())
+ out << setw(12) << int(round(b.approxOccurrenceCount()));
+ else
+ out << string(12,' ');
+ out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " ";
+ out << f.str(bg.V1.get()) << endl;
+}
+
+
+void
+dump(ostream& out, iter& f)
+{
+ float cnt = f.size() ? f.approxOccurrenceCount() : 0;
+ if (f.down())
+ {
+ cnt = f.approxOccurrenceCount();
+ do { dump(out,f); }
+ while (f.over());
+ f.up();
+ }
+ if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1)
+ show(out,f);
+}
+
+
+void
+read_data(string fname, vector<string>& dest)
+{
+ ifstream in(fname.c_str());
+ string line;
+ while (getline(in,line)) dest.push_back(line);
+ in.close();
+}
+
+int main(int argc, char* argv[])
+{
+ bg.open(argv[1],argv[2],argv[3]);
+ sptr<imbitext> fg(new imbitext(bg.V1,bg.V2));
+ vector<string> src,trg,aln;
+ read_data(argv[4],src);
+ read_data(argv[5],trg);
+ read_data(argv[6],aln);
+ fg = fg->add(src,trg,aln);
+ iter mfg(fg->I1.get());
+ dump(cout,mfg);
+ exit(0);
+}
+
+
+
diff --git a/moses/TranslationModel/UG/spe-check-coverage3.cc b/moses/TranslationModel/UG/spe-check-coverage3.cc
new file mode 100644
index 000000000..ea8c85e99
--- /dev/null
+++ b/moses/TranslationModel/UG/spe-check-coverage3.cc
@@ -0,0 +1,194 @@
+#include "mmsapt.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+typedef L2R_Token<SimpleWordId> Token;
+typedef mmBitext<Token> mmbitext;
+typedef imBitext<Token> imbitext;
+typedef Bitext<Token>::iter iter;
+
+mmbitext bg;
+vector<string> src,trg,aln;
+
+void
+show(ostream& out, iter& f)
+{
+ iter b(bg.I2.get(),f.getToken(0),f.size());
+ if (b.size() == f.size())
+ out << setw(12) << int(round(b.approxOccurrenceCount()));
+ else
+ out << string(12,' ');
+ out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " ";
+ out << f.str(bg.V1.get()) << endl;
+}
+
+
+void
+dump(ostream& out, iter& f)
+{
+ float cnt = f.size() ? f.approxOccurrenceCount() : 0;
+ if (f.down())
+ {
+ cnt = f.approxOccurrenceCount();
+ do { dump(out,f); }
+ while (f.over());
+ f.up();
+ }
+ if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1)
+ show(out,f);
+}
+
+
+void
+read_data(string fname, vector<string>& dest)
+{
+ ifstream in(fname.c_str());
+ string line;
+ while (getline(in,line)) dest.push_back(line);
+ in.close();
+}
+
+void
+show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt,
+ vector<vector<int> > const& a)
+{
+ for (size_t i = 0; i < snt.size(); ++i)
+ {
+ cout << format("%d:%s[") % i % V[snt[i].id()];
+ for (size_t k = 0; k < a[i].size(); ++k)
+ cout << (k?",":"") << a[i][k];
+ cout << "] ";
+ }
+ cout << endl;
+}
+
+
+void show_pair(size_t const sid)
+{
+ vector<Token> s,t;
+ fill_token_seq(*bg.V1,src[sid],s);
+ fill_token_seq(*bg.V2,trg[sid],t);
+ vector<vector<int> > a1(s.size()),a2(t.size());
+ istringstream buf(aln[sid]);
+ cout << aln[sid] << endl;
+ int i,k; char c;
+ while (buf >> i >> c >> k)
+ {
+ a1[i].push_back(k);
+ a2[k].push_back(i);
+ cout << i << "-" << k << " ";
+ }
+ cout << endl;
+ show_snt(cout,*bg.V1,s,a1);
+ show_snt(cout,*bg.V2,t,a2);
+}
+
+int main(int argc, char* argv[])
+{
+ if (argc < 5)
+ {
+ cerr << "usage: " << argv[0]
+ << " <bg base name> <L1> <L2> <fg base name>"
+ << endl;
+ exit(1);
+ }
+ bg.open(argv[1],argv[2],argv[3]);
+ sptr<imbitext> fg(new imbitext(bg.V1,bg.V2));
+ string base = argv[4];
+ if (*base.rbegin() != '.') base += '.';
+ string srcfile = base + argv[2];
+ string trgfile = base + argv[3];
+ string alnfile = base + "symal";
+ read_data(srcfile,src);
+ read_data(trgfile,trg);
+ read_data(alnfile,aln);
+ fg = fg->add(src,trg,aln);
+
+ vector<float> bias(src.size(),1./(src.size()-1));
+ for (size_t sid = 0; sid < src.size(); ++sid)
+ {
+ bias[sid] = 0;
+ // cout << src[sid] << endl << trg[sid] << endl;
+ // show_pair(sid);
+ vector<Token> snt;
+ fill_token_seq(*bg.V1,src[sid],snt);
+ vector<vector<sptr<vector<PhrasePair<Token> > > > > FG,BG;
+ fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true);
+ bg.lookup(snt,*bg.I1,BG,NULL,NULL,NULL,true);
+ set<sptr<vector<PhrasePair<Token> > > > seen;
+ for (size_t i = 0; i < snt.size(); ++i)
+ {
+ Bitext<Token>::iter m0(fg->I1.get());
+ Bitext<Token>::iter m1(bg.I1.get());
+ for (size_t k = 0; k < FG[i].size(); ++k)
+ {
+ if (!m0.extend(snt[i+k].id())) break;
+ if (k && m0.approxOccurrenceCount() < 2) break;
+ if (m1.size() == k && (!m1.extend(snt[i+k].id()) ||
+ m1.approxOccurrenceCount() < 25))
+ {
+ cout << toString((*fg->V1), m0.getToken(0), m0.size()) << " "
+ << int(m0.approxOccurrenceCount());
+ if (m1.size() == k + 1)
+ cout << " "<< int(m1.approxOccurrenceCount());
+ else if (m1.size())
+ cout << " ["<< int(m1.approxOccurrenceCount()) << "]";
+ else
+ cout << " NEW!";
+ cout << endl;
+ }
+ if (m0.approxOccurrenceCount() < 2) break;
+ BOOST_FOREACH(PhrasePair<Token> const& pp, *FG[i][k])
+ {
+ if (pp.joint < 2) continue;
+ sptr<pstats> bgstats;
+ jstats const* bgjstats = NULL;
+ Bitext<Token>::iter m2(bg.I2.get(), pp.start2, pp.len2);
+ if (m1.approxOccurrenceCount() > 5000 ||
+ m2.approxOccurrenceCount() > 5000)
+ continue;
+ if (m1.size() == pp.len1 && m2.size() == pp.len2)
+ {
+ bgstats = bg.lookup(m1,NULL);
+ if (bgstats)
+ {
+ pstats::trg_map_t::const_iterator mx;
+ mx = bgstats->trg.find(m2.getPid());
+ if (mx != bgstats->trg.end())
+ bgjstats = &mx->second;
+ }
+ }
+ cout << toString(*fg->V1, pp.start1, pp.len1) << " ::: "
+ << toString(*fg->V2, pp.start2, pp.len2) << " "
+ << format("[%u/%u/%u]") % pp.good1 % pp.joint % pp.good2;
+ if (bgjstats)
+ cout << " " << (format("[%u/%u/%u]")
+ % bgstats->good % bgjstats->rcnt()
+ % (bgjstats->cnt2() * bgstats->good
+ / bgstats->raw_cnt));
+ else if (m1.size() == pp.len1)
+ cout << " " << int(m1.approxOccurrenceCount());
+ cout << endl;
+
+ }
+ }
+ }
+ bias[sid] = 1./(src.size()-1);
+ }
+ exit(0);
+}
+
+
+
diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc
index 30c87ccab..daafec545 100644
--- a/moses/TranslationModel/UG/try-align.cc
+++ b/moses/TranslationModel/UG/try-align.cc
@@ -1,33 +1,514 @@
-#include "mmsapt.h"
+#include "mm/ug_bitext.h"
+#include <boost/format.hpp>
using namespace std;
using namespace Moses;
+using namespace ugdiss;
+typedef L2R_Token<SimpleWordId> Token;
+typedef mmTtrack<Token> ttrack_t;
+typedef mmTSA<Token> tsa_t;
+
+TokenIndex V1,V2;
+boost::shared_ptr<ttrack_t> T1,T2;
+tsa_t I1,I2;
+
+float lbop_level = .05;
+#define smooth 1
+namespace stats
+{
+ using namespace Moses::bitext;
+ float
+ pmi(size_t j,size_t m1, size_t m2, size_t N)
+ {
+#if smooth
+ float p1 = lbop(N,m1,lbop_level);
+ float p2 = lbop(N,m2,lbop_level);
+ float p12 = lbop(N,j,lbop_level);
+ return log(p12) - log(p1) - log(p2);
+#else
+ return log(j) + log(N) - log(m1) - log(m2);
+#endif
+ }
+
+ float
+ npmi(size_t j,size_t m1, size_t m2, size_t N)
+ {
+#if smooth
+ float p1 = lbop(N,m1,lbop_level);
+ float p2 = lbop(N,m2,lbop_level);
+ float p12 = lbop(N,j,lbop_level);
+ return (log(p12) - log(p1) - log(p2)) / -log(p12);
+#else
+ return pmi(j,m1,m2,N) / (log(N) - log(j));
+#endif
+ }
+
+ float
+ mi(size_t j,size_t m1, size_t m2, size_t N)
+ {
+ float ret = 0;
+ if (j) ret += float(j)/N * pmi(j,m1,m2,N);
+ if (m1>j) ret += float(m1-j)/N * pmi(m1-j,m1,N-m2,N);
+ if (m2>j) ret += float(m2-j)/N * pmi(m2-j,N-m1,m2,N);
+ if (N>m1+m2-j) ret += float(N-m1-m2+j)/N * pmi(N-m1-m2+j,N-m1,N-m2,N);
+ return ret;
+ }
+}
+
+struct SinglePhrase
+{
+ typedef map<uint64_t,sptr<SinglePhrase> > cache_t;
+ uint64_t pid; // phrase id
+ vector<ttrack::Position> occs; // occurrences
+};
+
+
+struct PhrasePair
+{
+ struct score_t;
+ uint64_t p1,p2;
+ ushort s1,e1,s2,e2;
+ int parent;
+
+ struct stats_t
+ {
+ typedef map<pair<uint64_t,uint64_t>, sptr<stats_t> > cache_t;
+ size_t m1,m2,j;
+ float npmi; // normalized point-wise mutual information
+ float pmi; // point-wise mutual information
+ float mi; // mutual information
+ float score;
+
+ void
+ set(vector<ttrack::Position> const& o1,
+ vector<ttrack::Position> const& o2,
+ size_t const N)
+ {
+ m1 = m2 = j = 0;
+ size_t i1=0,i2=0;
+ while (i1 < o1.size() && i2 < o2.size())
+ {
+ if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; }
+ if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; }
+
+ if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; }
+ else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; }
+ else { ++i2; ++m2; }
+ }
+ // for (++i1; i1 < o1.size(); ++i1)
+ // if (o1[i1-1].sid != o1[i1].sid) ++m1;
+ // for (++i2; i2 < o2.size(); ++i2)
+ // if (o2[i2-1].sid != o2[i2].sid) ++m2;
+
+ m1 = 1; m2 = 1;
+ for (i1=1; i1 < o1.size(); ++i1)
+ if (o1[i1-1].sid != o1[i1].sid) ++m1;
+ for (i2=1; i2 < o2.size(); ++i2)
+ if (o2[i2-1].sid != o2[i2].sid) ++m2;
+
+ this->mi = stats::mi(j,m1,m2,N);
+ this->pmi = stats::pmi(j,m1,m2,N);
+ this->npmi = stats::npmi(j,m1,m2,N);
+ // float z = float(m1)/N * float(m2)/N;
+ // float hmean = 2.*j/(m1+m2);
+ this->score = npmi; // npmi; // hmean; // /sqrt(z);
+ }
+ } stats;
+
+ PhrasePair(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0)
+ : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { }
+
+
+ bool
+ operator<(PhrasePair const& other) const
+ {
+ return (this->stats.score == other.stats.score
+ ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2)
+ : (this->stats.score > other.stats.score));
+ }
+
+ size_t len1() const { return e1 - s1; }
+ size_t len2() const { return e2 - s2; }
+ bool includes(PhrasePair const& o) const
+ {
+ return s1 <= o.s1 && e1 >= o.e1 && s2 <= o.s2 && e2 >= o.e2;
+ }
+
+};
+
+SinglePhrase::cache_t cache1,cache2;
+PhrasePair::stats_t::cache_t ppcache;
+
+
+struct SortByPositionInCorpus
+{
+ bool
+ operator()(ttrack::Position const& a,
+ ttrack::Position const& b) const
+ {
+ return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset;
+ }
+};
+
+
+void
+getoccs(tsa_t::tree_iterator const& m,
+ vector<ttrack::Position>& occs)
+{
+ occs.clear();
+ occs.reserve(m.approxOccurrenceCount()+10);
+ tsa::ArrayEntry I(m.lower_bound(-1));
+ char const* stop = m.upper_bound(-1);
+ do {
+ m.root->readEntry(I.next,I);
+ occs.push_back(I);
+ } while (I.next != stop);
+ sort(occs.begin(),occs.end(),SortByPositionInCorpus());
+}
+
+void
+lookup_phrases(vector<id_type> const& snt,
+ TokenIndex& V, ttrack_t const& T,
+ tsa_t const& I, SinglePhrase::cache_t& cache,
+ vector<vector<sptr<SinglePhrase> > >& dest)
+{
+ dest.resize(snt.size());
+ for (size_t i = 0; i < snt.size(); ++i)
+ {
+ tsa_t::tree_iterator m(&I);
+ dest[i].clear();
+ for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
+ {
+ if (m.approxOccurrenceCount() < 3) break;
+ // if (k - i > 0) break;
+ sptr<SinglePhrase>& o = cache[m.getPid()];
+ if (!o)
+ {
+ o.reset(new SinglePhrase());
+ o->pid = m.getPid();
+ getoccs(m,o->occs);
+ }
+ dest[i].push_back(o);
+ }
+ }
+}
+
+struct
+RowIndexSorter
+{
+ vector<vector<float> > const& M;
+ size_t const my_col;
+ RowIndexSorter(vector<vector<float> > const& m, size_t const c)
+ : M(m), my_col(c) { }
+
+ template<typename T>
+ bool
+ operator()(T const& a, T const& b) const
+ {
+ return M.at(a).at(my_col) > M.at(b).at(my_col);
+ }
+};
+
+struct
+ColIndexSorter
+{
+ vector<vector<float> > const& M;
+ size_t const my_row;
+ ColIndexSorter(vector<vector<float> > const& m, size_t const r)
+ : M(m), my_row(r) { }
+
+ template<typename T>
+ bool
+ operator()(T const& a, T const& b) const
+ {
+ return M.at(my_row).at(a) > M[my_row].at(b);
+ }
+
+};
-Mmsapt* PT;
int main(int argc, char* argv[])
{
string base = argv[1];
string L1 = argv[2];
string L2 = argv[3];
- ostringstream buf;
- buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
- << base << " L1=" << L1 << " L2=" << L2;
- string configline = buf.str();
- PT = new Mmsapt(configline);
- PT->Load();
- float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 };
- vector<float> weights(w,w+5);
- PT->setWeights(weights);
- // these values are taken from a moses.ini file;
- // is there a convenient way of accessing them from within mmsapt ???
- string eline,fline;
- // TokenIndex V; V.open("crp/trn/mm/de.tdx");
- while (getline(cin,eline) && getline(cin,fline))
+
+ T1.reset(new ttrack_t());
+ T2.reset(new ttrack_t());
+
+ V1.open(base + L1 + ".tdx");
+ T1->open(base + L1 + ".mct");
+ I1.open(base + L1 + ".sfa", T1);
+
+ V2.open(base + L2 + ".tdx");
+ T2->open(base + L2 + ".mct");
+ I2.open(base + L2 + ".sfa", T2);
+
+ tsa_t::tree_iterator m1(&I1);
+ tsa_t::tree_iterator m2(&I1);
+ string line1, line2;
+ while (getline(cin,line1) and getline(cin,line2))
{
- cout << eline << endl;
- cout << fline << endl;
- PT->align(eline,fline);
+ cout << "\n" << line1 << "\n" << line2 << endl;
+ vector<vector<sptr<SinglePhrase> > > M1,M2;
+ vector<id_type> snt1,snt2;
+ V1.fillIdSeq(line1,snt1);
+ V2.fillIdSeq(line2,snt2);
+ lookup_phrases(snt1,V1,*T1,I1,cache1,M1);
+ lookup_phrases(snt2,V2,*T2,I2,cache2,M2);
+
+ vector<PhrasePair> pp_all,pp_good;
+ vector<int> a1(snt1.size(),-1);
+ vector<int> a2(snt2.size(),-1);
+
+ vector<vector<int> > z1(snt1.size(),vector<int>(snt1.size(),-1));
+ vector<vector<int> > z2(snt2.size(),vector<int>(snt2.size(),-1));
+ vector<vector<vector<PhrasePair> > >ppm1(M1.size()),ppm2(M2.size());
+ vector<vector<float> > M(snt1.size(), vector<float>(snt2.size(),0));
+ vector<vector<size_t> > best1(snt1.size()), best2(snt2.size());
+ for (size_t i1 = 0; i1 < M1.size(); ++i1)
+ {
+ PhrasePair pp;
+ pp.s1 = i1;
+ ppm1[i1].resize(M1[i1].size());
+ for (size_t i2 = 0; i2 < M2.size(); ++i2)
+ {
+ pp.s2 = i2;
+ pp.stats.j = 1;
+ ppm2[i2].resize(M2[i2].size());
+ for (size_t k1 = 0; k1 < M1[i1].size(); ++k1)
+ {
+ pp.e1 = i1 + k1 + 1;
+ // if (pp.stats.j == 0) break;
+ for (size_t k2 = 0; k2 < M2[i2].size(); ++k2)
+ {
+ pp.e2 = i2 + k2 + 1;
+ sptr<PhrasePair::stats_t> & s
+ = ppcache[make_pair(M1[i1][k1]->pid,M2[i2][k2]->pid)];
+ if (!s)
+ {
+ s.reset(new PhrasePair::stats_t());
+ s->set(M1[i1][k1]->occs,M2[i2][k2]->occs,T1->size());
+ }
+ pp.stats = *s;
+ if (pp.stats.j == 0) break;
+ // ppm1[i1][k1].push_back(pp);
+ // ppm2[i2][k2].push_back(pp);
+ size_t J = pp.stats.j * 100;
+ if (pp.stats.score > 0
+ && J >= pp.stats.m1
+ && J > pp.stats.m2)
+ { pp_all.push_back(pp); }
+ }
+ }
+ }
+ }
+ sort(pp_all.begin(),pp_all.end());
+#if 0
+ BOOST_FOREACH(PhrasePair const& pp,pp_all)
+ {
+ if (pp.stats.npmi < 0) continue;
+ for (size_t r = pp.s1; r < pp.e1; ++r)
+ for (size_t c = pp.s2; c < pp.e2; ++c)
+ {
+ // M[r][c] += log(1-pp.stats.npmi);
+ M[r][c] += log(1-pp.stats.mi);
+ }
+ }
+ for (size_t r = 0; r < M.size(); ++r)
+ for (size_t c = 0; c < M[r].size(); ++c)
+ M[r][c] = 1.-exp(M[r][c]);
+ for (size_t r = 0; r < best1.size(); ++r)
+ {
+ best1[r].resize(snt2.size());
+ for (size_t c = 0; c < best1[r].size(); ++c)
+ best1[r][c] = c;
+ sort(best1[r].begin(),best1[r].end(),ColIndexSorter(M,r));
+ }
+ for (size_t c = 0; c < best2.size(); ++c)
+ {
+ best2[c].resize(snt1.size());
+ for (size_t r = 0; r < best2[c].size(); ++r)
+ best2[c][r] = r;
+ sort(best2[c].begin(),best2[c].end(),RowIndexSorter(M,c));
+ }
+ for (size_t r = 0; r < best1.size(); ++r)
+ {
+ cout << V1[snt1[r]] << ":";
+ for (size_t i = 0; i < min(3UL,M[r].size()); ++i)
+ {
+ size_t k = best1[r][i];
+ // if (M[r][k] >= M[best2[k][min(2UL,M.size())]][k])
+ cout << " " << k << ":" << V2[snt2[k]] << " " << M[r][k];
+ }
+ cout << endl;
+ }
+#endif
+#if 0
+ for (size_t k = 1; k < pp_all.size(); ++k)
+ for (size_t i = k; i--;)
+ if (pp_all[i].s1 >= pp_all[k].s1 &&
+ pp_all[i].e1 <= pp_all[k].e1 &&
+ pp_all[i].s2 >= pp_all[k].s2 &&
+ pp_all[i].e2 <= pp_all[k].e2)
+ pp_all[i].stats.score += pp_all[k].stats.score;
+ sort(pp_all.begin(),pp_all.end());
+#endif
+
+#if 1
+ vector<int> assoc1(snt1.size(),-1), assoc2(snt2.size(),-1);
+ for (size_t p = 0; p < pp_all.size(); ++p)
+ {
+ PhrasePair const& x = pp_all[p];
+ // if (x.stats.npmi < .7) break;
+ // if (z1[x.s1][x.e1-1] >= 0 || z2[x.s2][x.e2-1] >=0)
+ // continue;
+ for (size_t i = x.s1; i < x.e1; ++i)
+ {
+ if (assoc1[i] < 0)
+ assoc1[i] = p;
+ else
+ {
+ // PhrasePair& y = pp_all[assoc1[i]];
+ // if (y.includes(x))
+ // assoc1[i] = p;
+ }
+ }
+ for (size_t i = x.s2; i < x.e2; ++i)
+ {
+ if (assoc2[i] < 0)
+ assoc2[i] = p;
+ else
+ {
+ // PhrasePair& y = pp_all[assoc2[i]];
+ // if (y.includes(x))
+ // assoc2[i] = p;
+ }
+ }
+ z1[x.s1][x.e1-1] = p;
+ z2[x.s2][x.e2-1] = p;
+ continue;
+ cout << (boost::format("%.4f %.8f %.4f")
+ % x.stats.score
+ % x.stats.mi
+ % x.stats.npmi);
+ for (size_t z = x.s1; z < x.e1; ++z)
+ cout << " " << V1[snt1[z]];
+ cout << " :::";
+ for (size_t z = x.s2; z < x.e2; ++z)
+ cout << " " << V2[snt2[z]];
+ cout << " ["
+ << x.stats.m1 << "/" << x.stats.j << "/" << x.stats.m2
+ << "]" << endl;
+ }
+ vector<bool> done(pp_all.size(),false);
+ for (size_t i = 0; i < snt1.size(); ++i)
+ {
+ if (assoc1[i] < 0 || done[assoc1[i]])
+ continue;
+ // for (size_t k = 0; k < snt2.size(); ++k)
+ // if (assoc1[i] == assoc2[k])
+ {
+ done[assoc1[i]] = true;
+ PhrasePair& p = pp_all[assoc1[i]];
+ for (size_t j = p.s1; j < p.e1; ++j)
+ cout << j << ":" << V1[snt1[j]] << " ";
+ cout << " ::: ";
+ for (size_t j = p.s2; j < p.e2; ++j)
+ cout << j << ":" << V2[snt2[j]] << " ";
+ cout << "["
+ << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2
+ << "] "<< p.stats.score << endl;
+ // break;
+ }
+ }
+ cout << endl;
+ for (size_t i = 0; i < snt2.size(); ++i)
+ {
+ if (assoc2[i] < 0 || done[assoc2[i]])
+ continue;
+ done[assoc2[i]] = true;
+ PhrasePair& p = pp_all[assoc2[i]];
+ for (size_t j = p.s1; j < p.e1; ++j)
+ cout << j << ":" << V1[snt1[j]] << " ";
+ cout << " ::: ";
+ for (size_t j = p.s2; j < p.e2; ++j)
+ cout << j << ":" << V2[snt2[j]] << " ";
+ cout << "["
+ << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2
+ << "] "<< p.stats.score << endl;
+ }
+#endif
+ // sort(pp_all.begin(),pp_all.end());
+ // BOOST_FOREACH(PhrasePair const& pp, pp_all)
+ // {
+ // while (ppm1[pp.s1].size() < pp.e1 - pp.s1)
+ // ppm1[pp.s1].push_back(vector<PhrasePair>());
+ // vector<PhrasePair>& v1 = ppm1[pp.s1][pp.e1-pp.s1-1];
+ // if (v1.size() && v1[0].stats.score > pp.stats.score)
+ // continue;
+ // while (ppm2[pp.s2].size() < pp.e2 - pp.s2)
+ // ppm2[pp.s2].push_back(vector<PhrasePair>());
+ // vector<PhrasePair>& v2 = ppm2[pp.s2][pp.e2-pp.s2-1];
+ // if (v2.size() && v2[0].stats.score > pp.stats.score)
+ // continue;
+ // v1.push_back(pp);
+ // v2.push_back(pp);
+ // }
+
+
+ // BOOST_FOREACH(vector<vector<PhrasePair> >& vv, ppm1)
+ // {
+ // BOOST_FOREACH(vector<PhrasePair>& v, vv)
+ // {
+ // sort(v.begin(),v.end());
+ // if (v.size() > 1 && v[0].stats.score == v[1].stats.score)
+ // v.clear();
+ // }
+ // }
+ // for (size_t i2 = 0; i2 < ppm2.size(); ++i2)
+ // {
+ // for (size_t k2 = 0; k2 < ppm2[i2].size(); ++k2)
+ // {
+ // vector<PhrasePair>& v2 = ppm2[i2][k2];
+ // sort(v2.begin(),v2.end());
+ // if (v2.size() > 1 && v2[0].stats.score == v2[1].stats.score)
+ // {
+ // v2.clear();
+ // continue;
+ // }
+ // ushort i1 = v2[0].s1;
+ // ushort k1 = v2[0].e1 - i1 -1;
+
+ // if (ppm1[i1][k1].size() == 0 ||
+ // ppm1[i1][k1][0].s2 != i2 ||
+ // ppm1[i1][k1][0].e2 != i2 + k2 + 1)
+ // { v2.clear(); }
+ // else pp_good.push_back(ppm2[i2][k2][0]);
+ // }
+ // }
+ // BOOST_FOREACH(PhrasePair const& pp, pp_good)
+ // {
+ // cout << pp.stats.mi << " ";
+ // for (size_t z = pp.s1; z < pp.e1; ++z)
+ // cout << V1[snt1[z]] << " ";
+ // cout << " ::: ";
+ // for (size_t z = pp.s2; z < pp.e2; ++z)
+ // cout << V2[snt2[z]] << " ";
+ // cout << pp.stats.m1 << "/" << pp.stats.j << "/" << pp.stats.m2 << endl;
+ // }
+ // // cout << string(80,'=') << endl;
+ // // sort(pp_all.begin(),pp_all.end());
+ // // BOOST_FOREACH(PhrasePair const& pp, pp_all)
+ // // {
+ // // cout << pp.mi << " ";
+ // // for (size_t z = pp.s1; z < pp.e1; ++z)
+ // // cout << V1[snt1[z]] << " ";
+ // // cout << " ::: ";
+ // // for (size_t z = pp.s2; z < pp.e2; ++z)
+ // // cout << V2[snt2[z]] << " ";
+ // // cout << pp.m1 << "/" << pp.j << "/" << pp.m2 << endl;
+ // // }
+
}
- delete PT;
}
diff --git a/moses/TranslationModel/UG/try-align2.cc b/moses/TranslationModel/UG/try-align2.cc
new file mode 100644
index 000000000..57cf25035
--- /dev/null
+++ b/moses/TranslationModel/UG/try-align2.cc
@@ -0,0 +1,886 @@
+#include "mm/ug_bitext.h"
+#include <boost/format.hpp>
+// #include <unicode/stringpiece.h>
+#include <unicode/translit.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/uchar.h>
+#include <unicode/utf8.h>
+#include "moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h"
+
+using namespace std;
+using namespace Moses;
+using namespace ugdiss;
+using namespace Moses::bitext;
+
+typedef L2R_Token<SimpleWordId> Token;
+typedef mmTtrack<Token> ttrack_t;
+typedef mmTSA<Token> tsa_t;
+typedef vector<Moses::bitext::PhrasePair<Token> > pplist_t;
+typedef pair<ushort,ushort> span_t;
+
+TokenIndex V1,V2;
+boost::shared_ptr<ttrack_t> T1,T2;
+tsa_t I1,I2;
+mmBitext<Token> BT;
+
+float lbop_level = .05;
+#define smooth 1
+namespace stats
+{
+ using namespace Moses::bitext;
+ float
+ pmi(size_t j,size_t m1, size_t m2, size_t N)
+ {
+#if smooth
+ float p1 = lbop(N,m1,lbop_level);
+ float p2 = lbop(N,m2,lbop_level);
+ float p12 = lbop(N,j,lbop_level);
+ return log(p12) - log(p1) - log(p2);
+#else
+ return log(j) + log(N) - log(m1) - log(m2);
+#endif
+ }
+
+ float
+ npmi(size_t j,size_t m1, size_t m2, size_t N)
+ {
+#if smooth
+ // cout << j << " " << m1 << " " << m2 << " " << N << endl;
+ float p1 = lbop(N,m1,lbop_level);
+ float p2 = lbop(N,m2,lbop_level);
+ float p12 = lbop(N,j,lbop_level);
+ return (log(p12) - log(p1) - log(p2)) / -log(p12);
+#else
+ return pmi(j,m1,m2,N) / (log(N) - log(j));
+#endif
+ }
+
+ float
+ mi(size_t j,size_t m1, size_t m2, size_t N)
+ {
+ float ret = 0;
+ if (j) ret += float(j)/N * pmi(j,m1,m2,N);
+ if (m1>j) ret += float(m1-j)/N * pmi(m1-j,m1,N-m2,N);
+ if (m2>j) ret += float(m2-j)/N * pmi(m2-j,N-m1,m2,N);
+ if (N>m1+m2-j) ret += float(N-m1-m2+j)/N * pmi(N-m1-m2+j,N-m1,N-m2,N);
+ return ret;
+ }
+}
+
+struct SinglePhrase
+{
+ typedef map<uint64_t,sptr<SinglePhrase> > cache_t;
+ uint64_t pid; // phrase id
+ vector<ttrack::Position> occs; // occurrences
+};
+
+
+struct PhrasePair2
+{
+ struct score_t;
+ uint64_t p1,p2;
+ ushort s1,e1,s2,e2;
+ int parent;
+
+ struct stats_t
+ {
+ typedef map<pair<uint64_t,uint64_t>, sptr<stats_t> > cache_t;
+ size_t m1,m2,j;
+ float npmi; // normalized point-wise mutual information
+ float pmi; // point-wise mutual information
+ float mi; // mutual information
+ float score;
+
+ void
+ set(vector<ttrack::Position> const& o1,
+ vector<ttrack::Position> const& o2,
+ size_t const N)
+ {
+ m1 = m2 = j = 0;
+ size_t i1=0,i2=0;
+ while (i1 < o1.size() && i2 < o2.size())
+ {
+ if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; }
+ if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; }
+
+ if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; }
+ else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; }
+ else { ++i2; ++m2; }
+ }
+ // for (++i1; i1 < o1.size(); ++i1)
+ // if (o1[i1-1].sid != o1[i1].sid) ++m1;
+ // for (++i2; i2 < o2.size(); ++i2)
+ // if (o2[i2-1].sid != o2[i2].sid) ++m2;
+
+ m1 = 1; m2 = 1;
+ for (i1=1; i1 < o1.size(); ++i1)
+ if (o1[i1-1].sid != o1[i1].sid) ++m1;
+ for (i2=1; i2 < o2.size(); ++i2)
+ if (o2[i2-1].sid != o2[i2].sid) ++m2;
+
+ this->mi = stats::mi(j,m1,m2,N);
+ this->pmi = stats::pmi(j,m1,m2,N);
+ this->npmi = stats::npmi(j,m1,m2,N);
+ // float z = float(m1)/N * float(m2)/N;
+ // float hmean = 2.*j/(m1+m2);
+ this->score = npmi; // npmi; // hmean; // /sqrt(z);
+ }
+ } stats;
+
+ PhrasePair2(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0)
+ : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { }
+
+
+ bool
+ operator<(PhrasePair2 const& other) const
+ {
+ return (this->stats.score == other.stats.score
+ ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2)
+ : (this->stats.score > other.stats.score));
+ }
+
+ size_t len1() const { return e1 - s1; }
+ size_t len2() const { return e2 - s2; }
+ bool includes(PhrasePair2 const& o) const
+ {
+ return s1 <= o.s1 && e1 >= o.e1 && s2 <= o.s2 && e2 >= o.e2;
+ }
+
+};
+
+SinglePhrase::cache_t cache1,cache2;
+PhrasePair2::stats_t::cache_t ppcache;
+
+
+struct SortByPositionInCorpus
+{
+ bool
+ operator()(ttrack::Position const& a,
+ ttrack::Position const& b) const
+ {
+ return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset;
+ }
+};
+
+
+void
+getoccs(tsa_t::tree_iterator const& m,
+ vector<ttrack::Position>& occs)
+{
+ occs.clear();
+ occs.reserve(m.approxOccurrenceCount()+10);
+ tsa::ArrayEntry I(m.lower_bound(-1));
+ char const* stop = m.upper_bound(-1);
+ do {
+ m.root->readEntry(I.next,I);
+ occs.push_back(I);
+ } while (I.next != stop);
+ sort(occs.begin(),occs.end(),SortByPositionInCorpus());
+}
+
+void
+lookup_phrases(vector<id_type> const& snt,
+ TokenIndex& V, ttrack_t const& T,
+ tsa_t const& I, SinglePhrase::cache_t& cache,
+ vector<vector<sptr<SinglePhrase> > >& dest)
+{
+ dest.resize(snt.size());
+ for (size_t i = 0; i < snt.size(); ++i)
+ {
+ tsa_t::tree_iterator m(&I);
+ dest[i].clear();
+ for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
+ {
+ if (m.approxOccurrenceCount() < 3) break;
+ // if (k - i > 0) break;
+ sptr<SinglePhrase>& o = cache[m.getPid()];
+ if (!o)
+ {
+ o.reset(new SinglePhrase());
+ o->pid = m.getPid();
+ getoccs(m,o->occs);
+ }
+ dest[i].push_back(o);
+ }
+ }
+}
+
+
+struct
+RowIndexSorter
+{
+ vector<vector<float> > const& M;
+ size_t const my_col;
+ RowIndexSorter(vector<vector<float> > const& m, size_t const c)
+ : M(m), my_col(c) { }
+
+ template<typename T>
+ bool
+ operator()(T const& a, T const& b) const
+ {
+ return M.at(a).at(my_col) > M.at(b).at(my_col);
+ }
+};
+
+struct
+ColIndexSorter
+{
+ vector<vector<float> > const& M;
+ size_t const my_row;
+ ColIndexSorter(vector<vector<float> > const& m, size_t const r)
+ : M(m), my_row(r) { }
+
+ template<typename T>
+ bool
+ operator()(T const& a, T const& b) const
+ {
+ return M.at(my_row).at(a) > M[my_row].at(b);
+ }
+
+};
+
+template<typename Token>
+class
+npmi_scorer1 : public Moses::bitext::PhrasePair<Token>::Scorer
+{
+public:
+ float operator()(PhrasePair<Token>& pp) const
+ {
+#if 0
+ cout << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " "
+ << pp.raw2 << " " << pp.sample2 << " " << pp.good2 << " "
+ << pp.joint << " " << __FILE__ << ":" << __LINE__ << endl;
+#endif
+ pp.good2 = ceil(pp.raw2 * float(pp.good1)/pp.raw1);
+ size_t N = ceil(BT.T1->numTokens() * float(pp.good1)/pp.raw1);
+ return pp.score = stats::npmi(pp.joint,pp.good1,pp.good2,N);
+ }
+};
+
+
+class Alnhyp
+{
+ ushort s1,s2,e1,e2;
+ float score;
+};
+
+
+size_t
+lcs(string const a, string const b)
+{
+ using namespace stringdist;
+ if (a == b) return a.size();
+ StringDiff diff(a,b);
+ size_t ret = 0;
+ size_t len = 0;
+ // size_t th = min(size_t(4),min(a.size(),b.size()));
+ for (size_t i = 0; i < diff.size(); ++i)
+ {
+ StringDiff::Segment const& s = diff[i];
+ if (s.match != StringDiff::same && s.match != StringDiff::cap)
+ {
+ if (len > ret) ret = len;
+ len = 0;
+ continue;
+ }
+ len += s.end_a - s.start_a;
+ }
+ if (len > ret) ret = len;
+ return ret;
+}
+
+size_t
+mapstring(string const& utf8,
+ UnicodeString& U,
+ vector<int>& c2w,
+ vector<int>* wlen=NULL)
+{
+ static UChar space = UnicodeString(" ")[0];
+ assert(utf8.at(0) != ' ');
+ U = UnicodeString(utf8.c_str()).toLower();
+ stringdist::strip_accents(U);
+ c2w.assign(U.length(),-1);
+ size_t k = 0;
+ size_t z = 0;
+ for (int i = 0; i < U.length(); ++i)
+ if (U[i] == space) { if (wlen) wlen->push_back(i-z); z = ++k; }
+ else c2w[i] = k;
+ assert(c2w.back() >= 0);
+ if (wlen) wlen->push_back(U.length()-z);
+ return k+1;
+}
+
+void
+align_letters(UnicodeString const& A, vector<int> const& a2p,
+ UnicodeString const& B, vector<int> const& b2p,
+ vector<vector<int> >& W)
+{
+ vector<vector<int> > M(A.length(),vector<int>(B.length(),0));
+ for (int a = 0; a < A.length(); ++a)
+ {
+ for (int b = 0; b < B.length(); ++b)
+ {
+ if (A[a] != B[b] || a2p[a] < 0 || b2p[b] < 0)
+ continue;
+ M[a][b] = (a && b) ? M[a-1][b-1] + 1 : 1;
+ int& x = W[a2p[a]][b2p[b]];
+ x = max(x,M[a][b]);
+ }
+ }
+ // string astring; A.toUTF8String(astring);
+ // string bstring; B.toUTF8String(bstring);
+ // cout << astring << "\n" << bstring << endl;
+ // for (size_t r = 0; r < W.size(); ++r)
+ // {
+ // BOOST_FOREACH(int x, W[r]) cout << setw(3) << x;
+ // cout << endl;
+ // }
+}
+
+void
+map_back(vector<vector<int> > const& W,
+ vector<vector<int> > & X,
+ vector<uchar> const & aln)
+{
+ for (size_t i = 0; i < aln.size(); i += 2)
+ {
+ vector<int> const& w = W.at(aln[i+1]);
+ vector<int>& x = X.at(aln[i]);
+ assert(x.size() == w.size());
+ for (size_t k = 0; k < x.size(); ++k)
+ x[k] = max(w[k],x[k]);
+ }
+}
+
+
+void trymatch3(vector<PhrasePair<Token> > const& tcands,
+ UnicodeString const& T, size_t const tlen,
+ vector<int> const& t2p,
+ TokenIndex const& V2, vector<vector<int> >&X)
+{
+ BOOST_FOREACH(PhrasePair<Token> const& pp, tcands)
+ {
+ UnicodeString H; vector<int> h2p;
+ string hstring = toString(V2, pp.start2, pp.len2);
+ size_t hlen = mapstring(hstring,H,h2p);
+ vector<vector<int> > W(hlen,vector<int>(tlen,0));
+ align_letters(H, h2p, T, t2p, W);
+#if 0
+ string s; S.toUTF8String(s);
+ string h; H.toUTF8String(h);
+ string t; T.toUTF8String(t);
+ cout << s << endl << h << endl << t << endl;
+ cout << slen << " " << tlen << endl;
+ cout << "W: " << W.size() << " rows; " << W[0].size() << " cols" << endl;
+ cout << "X: " << X.size() << " rows; " << X[0].size() << " cols" << endl;
+ cout << "aln: ";
+ for (size_t a = 0; a < pp.aln.size(); a +=2)
+ cout << int(pp.aln[a]) << "-" << int(pp.aln[a+1]) << " ";
+ cout << endl;
+#endif
+ map_back(W,X,pp.aln);
+ }
+}
+
+void minmatch_filter(vector<vector<int> > & X,
+ vector<int> const& len1,
+ vector<int> const& len2)
+{
+ // compute marginals
+ vector<int> m1(X.size(),0), m2(X.at(0).size(),0);
+ for (size_t r = 0; r < X.size(); ++r)
+ for (size_t c = 0; c < X[r].size(); ++c)
+ {
+ if (X[r][c] == 0) continue;
+ m1[r] += X[r][c];
+ m2[c] += X[r][c];
+ }
+
+ bool sth_changed = true;
+ while (sth_changed)
+ {
+ sth_changed = false;
+ for (size_t r = 0; r < m1.size(); ++r)
+ {
+ if (m1[r] && m1[r] < max(2,min(5,len1[r]/2)))
+ {
+ sth_changed = true;
+ for (size_t c = 0; c < X[r].size(); ++c)
+ {
+ m2[c] -= X[r][c];
+ X[r][c] = 0;
+ }
+ m1[r] = 0;
+ }
+ }
+
+ for (size_t c = 0; c < m2.size(); ++c)
+ {
+ if (m2[c] && m2[c] < max(2,min(5,len2[c]/2)))
+ {
+ sth_changed = true;
+ for (size_t r = 0; r < m1.size(); ++r)
+ {
+ m1[r] -= X[r][c];
+ X[r][c] = 0;
+ }
+ m2[c] = 0;
+ }
+ }
+ }
+}
+
+
+void
+trymatch2(TokenIndex& V1, // source language vocab
+ TokenIndex& V2, // target language vocab
+ string const& source, // source phrase
+ string const& target, // observed target candidate
+ vector<PhrasePair<Token> > const* const tcands,
+ vector<vector<int> >& X) // destination alignment matrix
+ // tcands: translations for source
+{
+ UnicodeString S,T;
+ vector<int> t2p, s2p; // maps from character position in string to word pos.
+ vector<int> wlen_t, wlen_s; // individual word lengths
+ size_t slen = mapstring(source, S, s2p, &wlen_s);
+ size_t tlen = mapstring(target, T, t2p, &wlen_t);
+
+ X.assign(slen,vector<int>(tlen,0));
+ if (slen == 1 && tlen ==1 && S == T)
+ X[0][0] = S.length();
+ else
+ {
+ align_letters(S,s2p,T,t2p,X);
+ if (tcands) trymatch3(*tcands, T, tlen, t2p, V2, X);
+ }
+
+ minmatch_filter(X, wlen_s, wlen_t);
+ bool hit = false;
+ for (size_t r = 0; !hit && r < X.size(); ++r)
+ for (size_t c = 0; !hit && c < X[r].size(); ++c)
+ hit = X[r][c] > min(S.length(),T.length())/2;
+
+ // if (hit)
+ // {
+ // cout << source << " ::: " << target;
+ // for (size_t r = 0; r < X.size(); ++r)
+ // for (size_t c = 0; c < X[r].size(); ++c)
+ // cout << boost::format(" %u-%u:%d") % r % c % X[r][c];
+ // cout << endl;
+ // }
+}
+
+
+
+// float
+// trymatch(string const a, string const b,
+// vector<PhrasePair<Token> > const* atrans,
+// vector<PhrasePair<Token> > const* btrans)
+// {
+// if (a == b) return a.size();
+// float score = 0;
+// float bar = lcs(a,b);
+// // score = max(bar/min(a.size(),b.size()),score);
+// score = max(bar,score);
+// // cout << "\n[" << bar << "] " << a << " ::: " << b << endl;
+// if (atrans)
+// {
+// BOOST_FOREACH(PhrasePair<Token> const& pp, *atrans)
+// {
+// // if (!pp.aln.size()) continue;
+// ushort L = pp.aln[1], R = pp.aln[1];
+// for (size_t k = 3; k < pp.aln.size(); k += 2)
+// {
+// if (L > pp.aln[k]) L = pp.aln[k];
+// if (R < pp.aln[k]) R = pp.aln[k];
+// }
+// if (L || R+1U < pp.len2) continue;
+// string foo = toString(*BT.V2,pp.start2,pp.len2);
+// // float bar = float(lcs(foo,b))/min(foo.size(),b.size());
+// float bar = float(lcs(foo,b));
+
+// if (bar > .5)
+// {
+// // score = max(pp.score * bar,score);
+// score = max(bar,score);
+// // cout << "[" << bar << "] " << foo << " ::: " << b
+// // << " (" << a << ") " << pp.score << endl;
+// }
+// }
+// }
+// if (btrans)
+// {
+// BOOST_FOREACH(PhrasePair<Token> const& pp, *btrans)
+// {
+// // if (!pp.aln.size()) continue;
+// ushort L = pp.aln[1], R = pp.aln[1];
+// for (size_t k = 3; k < pp.aln.size(); k += 2)
+// {
+// if (L > pp.aln[k]) L = pp.aln[k];
+// if (R < pp.aln[k]) R = pp.aln[k];
+// }
+// if (L || R+1U < pp.len2) continue;
+// string foo = toString(*BT.V1,pp.start2,pp.len2);
+// // float bar = float(lcs(a,foo))/min(a.size(),foo.size());
+// float bar = float(lcs(a,foo));
+// if (bar > .5)
+// {
+// score = max(bar,score);
+// // cout << "[" << bar<< "] " << a << " ::: " << foo
+// // << " (" << b << ") " << pp.score << endl;
+// }
+// }
+// }
+// return score;
+// }
+
+struct ahyp
+{
+ ushort s1,s2,e1,e2;
+ float score;
+ bool operator<(ahyp const& o) const { return score < o.score; }
+ bool operator>(ahyp const& o) const { return score > o.score; }
+};
+
+struct AlnPoint
+{
+ enum status { no = 0, yes = 1, maybe = -1, undef = -7 };
+ float score;
+ status state;
+ AlnPoint() : score(0), state(undef) {}
+};
+
+bool overlap(span_t const& a, span_t const& b)
+{
+ return !(a.second <= b.first || b.second <= a.first);
+}
+
+class AlnMatrix
+{
+ vector<bitvector> A1,A2; // final alignment matrix
+ vector<bitvector> S1,S2; // shadow alignment matrix
+public:
+ vector<bitvector*> m1,m2; // margins
+ AlnMatrix(size_t const rows, size_t const cols);
+ bitvector const&
+ operator[](size_t const r) const
+ { return A1.at(r); }
+
+ bool
+ incorporate(span_t const& rspan, span_t const& cspan,
+ vector<uchar> const& aln, bool const flip);
+
+ size_t size() const { return A1.size(); }
+};
+
+AlnMatrix::
+AlnMatrix(size_t const rows, size_t const cols)
+{
+ A1.assign(rows,bitvector(cols));
+ S1.assign(rows,bitvector(cols));
+ A2.assign(cols,bitvector(rows));
+ S2.assign(cols,bitvector(rows));
+ m1.assign(rows,NULL);
+ m2.assign(cols,NULL);
+}
+
+bool
+AlnMatrix::
+incorporate(span_t const& rspan,
+ span_t const& cspan,
+ vector<uchar> const& aln,
+ bool const flip)
+{
+ for (size_t r = rspan.first; r < rspan.second; ++r)
+ S1[r].reset();
+ for (size_t c = cspan.first; c < cspan.second; ++c)
+ S2[c].reset();
+ if (flip)
+ {
+ for (size_t i = 0; i < aln.size(); i += 2)
+ {
+ size_t r = rspan.first + aln[i];
+ size_t c = cspan.first + aln[i+1];
+ S1[r].set(c);
+ S2[c].set(r);
+ }
+ }
+ else
+ {
+ for (size_t i = 0; i < aln.size(); i += 2)
+ {
+ size_t r = rspan.first + aln[i+1];
+ size_t c = cspan.first + aln[i];
+ S1[r].set(c);
+ S2[c].set(r);
+ }
+ }
+ // check compatibility with existing alignment
+ for (size_t r = rspan.first; r < rspan.second; ++r)
+ if (m1[r] && (*m1[r]) != S1[r]) return false;
+ for (size_t c = cspan.first; c < cspan.second; ++c)
+ if (m2[c] && (*m2[c]) != S2[c]) return false;
+
+ // all good, add new points
+ for (size_t r = rspan.first; r < rspan.second; ++r)
+ if (!m1[r]) { A1[r] = S1[r]; m1[r] = &A1[r]; }
+ for (size_t c = cspan.first; c < cspan.second; ++c)
+ if (!m2[c]) { A2[c] = S2[c]; m2[c] = &A2[c]; }
+
+ return true;
+}
+
+struct alink
+{
+ size_t r,c,m;
+ bool operator<(alink const& o) const { return m < o.m; }
+ bool operator>(alink const& o) const { return m > o.m; }
+};
+
+int main(int argc, char* argv[])
+{
+ string base = argc > 1 ? argv[1] : "crp/trn/mm/";
+ string L1 = argc > 1 ? argv[2] : "de";
+ string L2 = argc > 1 ? argv[3] : "en";
+ BT.open(base,L1,L2);
+ BT.V1->setDynamic(true);
+ BT.V2->setDynamic(true);
+ string line1, line2;
+ npmi_scorer1<Token> scorer;
+ while (getline(cin,line1) and getline(cin,line2))
+ {
+ cout << "\n" << line1 << "\n" << line2 << endl;
+ vector<Token> snt1,snt2;
+ fill_token_seq(*BT.V1,line1,snt1);
+ fill_token_seq(*BT.V2,line2,snt2);
+ vector<vector<sptr<vector<PhrasePair<Token> > > > > pt1,pt2;
+ vector<vector<uint64_t> > pm1,pm2;
+ BT.lookup(snt1,*BT.I1,pt1,&pm1,&scorer);
+ BT.lookup(snt2,*BT.I2,pt2,&pm2,&scorer);
+
+ // build map from phrases to positions
+ typedef boost::unordered_map<uint64_t, vector<span_t> >
+ p2s_map_t;
+ typedef p2s_map_t::iterator p2s_iter;
+ p2s_map_t p2s1,p2s2;
+ for (ushort i = 0; i < pm1.size(); ++i)
+ for (ushort k = 0; k < pm1[i].size(); ++k)
+ p2s1[pm1[i][k]].push_back(make_pair(i,i+k+1));
+ for (ushort i = 0; i < pm2.size(); ++i)
+ for (ushort k = 0; k < pm2[i].size(); ++k)
+ p2s2[pm2[i][k]].push_back(make_pair(i,i+k+1));
+
+ boost::unordered_map<uint64_t,sptr<vector<PhrasePair<Token> > > > all1,all2;
+ vector<PhrasePair<Token> > pp_all;
+ for (size_t i = 0; i < pt2.size(); ++i)
+ for (size_t k = 0; k < pt2[i].size(); ++k)
+ all2[pm2[i][k]] = pt2[i][k];
+ for (size_t i = 0; i < pt1.size(); ++i)
+ for (size_t k = 0; k < pt1[i].size(); ++k)
+ {
+ all1[pm1[i][k]] = pt1[i][k];
+ BOOST_FOREACH(PhrasePair<Token> const& pp, *pt1[i][k])
+ {
+ if (pp.score < 0) break;
+ if (p2s2.find(pp.p2) != p2s2.end())
+ pp_all.push_back(pp);
+ }
+ }
+ sort(pp_all.begin(), pp_all.end(), greater<PhrasePair<Token> >());
+ vector<int> a1(snt1.size(),-1), a2(snt2.size(),-1);
+
+ vector<bitvector> R(snt1.size(),bitvector(snt2.size()));
+ vector<bitvector> C(snt2.size(),bitvector(snt1.size()));
+ vector<bitvector> myR(snt1.size(),bitvector(snt2.size()));
+ vector<bitvector> myC(snt2.size(),bitvector(snt1.size()));
+ vector<bitvector*> m1(snt1.size(),NULL);
+ vector<bitvector*> m2(snt2.size(),NULL);
+
+ // vector<vector<AlnPoint> > M(snt1.size(),vector<AlnPoint>(snt2.size()));
+ AlnMatrix A(snt1.size(),snt2.size());
+ for (size_t p = 0; p < pp_all.size(); ++p)
+ {
+ PhrasePair<Token> const& pp = pp_all[p];
+#if 0
+ cout << (boost::format("%30s ::: %-30s ")
+ % BT.toString(pp.p1,0).c_str()
+ % BT.toString(pp.p2,1).c_str());
+ cout << (boost::format("%.4f [%d/%d/%d]")
+ % pp.score % pp.good1 % pp.joint % pp.good2);
+ for (size_t a = 0; a < pp.aln.size(); a += 2)
+ cout << " " << int(pp.aln[a]) << "-" << int(pp.aln[a+1]);
+ cout << endl;
+#endif
+
+ vector<span_t>& v1 = p2s1[pp.p1];
+ vector<span_t>& v2 = p2s2[pp.p2];
+ if (v1.size() == 1)
+ for (size_t i = v1[0].first; i < v1[0].second; ++i)
+ if (a1[i] < 0) a1[i] = p;
+ if (v2.size() == 1)
+ for (size_t i = v2[0].first; i < v2[0].second; ++i)
+ if (a2[i] < 0) a2[i] = p;
+
+ if (v1.size() == 1 && v2.size() == 1)
+ A.incorporate(v1[0],v2[0],pp.aln,pp.inverse);
+ }
+
+ for (size_t i = 0; i < A.size(); ++i)
+ {
+ cout << (*BT.V1)[snt1[i].id()] << ": ";
+ for (size_t k=A[i].find_first(); k < A[i].size(); k=A[i].find_next(k))
+ cout << boost::format(" %d:%s") % k % (*BT.V2)[snt2[k].id()];
+ cout << endl;
+ }
+
+
+
+ vector<PhrasePair<Token> > const* atrans, *btrans;
+ ahyp h;
+ vector<ahyp> hyps;
+ vector<vector<int> > L(snt1.size(),vector<int>(snt2.size(),0));
+ // L: matches by letter overlap
+
+ for (h.s1 = 0; h.s1 < a1.size(); ++h.s1)
+ {
+ if (a1[h.s1] >= 0) continue;
+ ostringstream buf1;
+ for (h.e1 = h.s1; h.e1 < a1.size() && a1[h.e1] < 0; ++h.e1)
+ {
+ if (h.e1 > h.s1)
+ {
+ if (pt1[h.s1].size() + h.s1 <= h.e1) break;
+ buf1 << " ";
+ }
+ buf1 << (*BT.V1)[snt1[h.e1].id()];
+ atrans = pt1[h.s1].size() ? pt1[h.s1].at(h.e1-h.s1).get() : NULL;
+ for (h.s2 = 0; h.s2 < a2.size(); ++h.s2)
+ {
+ ostringstream buf2;
+ if (a2[h.s2] >= 0) continue;
+ for (h.e2 = h.s2; h.e2 < a2.size() && a2[h.e2] < 0; ++h.e2)
+ {
+ if (h.e2 > h.s2)
+ {
+ if (pt2[h.s2].size() + h.s2 <= h.e2) break;
+ buf2 << " ";
+ }
+ buf2 << (*BT.V2)[snt2[h.e2].id()];
+ btrans = (pt2[h.s2].size()
+ ? pt2[h.s2].at(h.e2-h.s2).get()
+ : NULL);
+
+ vector<vector<int> > aln;
+ trymatch2(*BT.V1, *BT.V2, buf1.str(),buf2.str(),
+ atrans,aln);
+ for (size_t i = 0; i < aln.size(); ++i)
+ for (size_t k = 0; k < aln[i].size(); ++k)
+ L[h.s1+i][h.s2+k] = max(L[h.s1+i][h.s2+k],aln[i][k]);
+ trymatch2(*BT.V2, *BT.V1, buf2.str(),buf1.str(),
+ btrans,aln);
+ for (size_t i = 0; i < aln[0].size(); ++i)
+ for (size_t k = 0; k < aln.size(); ++k)
+ L[h.s1+i][h.s2+k] = max(L[h.s1+i][h.s2+k],aln[k][i]);
+ // h.score = trymatch(buf1.str(), buf2.str(), atrans, btrans);
+ // hyps.push_back(h);
+ }
+ }
+ }
+ }
+
+ vector<alink> links;
+
+ alink x;
+ for (x.r = 0; x.r < L.size(); ++x.r)
+ {
+
+ for (x.c = 0; x.c < L[x.r].size(); ++x.c)
+ {
+ x.m = L[x.r][x.c];
+ if (x.m) links.push_back(x);
+ }
+ }
+
+ sort(links.begin(),links.end(),greater<alink>());
+
+ BOOST_FOREACH(alink& x, links)
+ {
+ if (L[x.r][x.c])
+ {
+ cout << (*BT.V1)[snt1[x.r].id()] << " ::: "
+ << (*BT.V2)[snt2[x.c].id()] << " ::: "
+ << L[x.r][x.c] << endl;
+ }
+ }
+
+ // sort(hyps.begin(),hyps.end(),greater<ahyp>());
+ // BOOST_FOREACH(ahyp const& h, hyps)
+ // {
+ // if (h.score < .5) break;
+ // for (size_t i = h.s1; i <= h.e1; ++i)
+ // cout << i << ":" << (*BT.V1)[snt1[i].id()] << " ";
+ // cout << " ::: ";
+ // for (size_t i = h.s2; i <= h.e2; ++i)
+ // cout << i << ":" << (*BT.V2)[snt2[i].id()] << " ";
+ // cout << h.score << endl;
+ // }
+
+ }
+}
+
+
+// for (size_t i = 0; i < pt1.size(); ++i)
+// {
+// for (size_t k = 0; k < pt1[i].size(); ++k)
+// {
+// size_t d1 = 0;
+// bool first = true;
+// BOOST_FOREACH(PhrasePair<Token> const& pt, *pt1[i][k])
+// {
+// TSA<Token>::tree_iterator m(BT.I2.get(),pt.start2,pt.len2);
+// if (pt.score < 0) break;
+// int left = pt.aln[1], right = pt.aln[1];
+// bool match = p2s2.find(m.getPid()) != p2s2.end();
+// if (!match)
+// {
+// for (size_t a = 3; a < pt.aln.size(); a += 2)
+// {
+// if (left > pt.aln[a]) left = pt.aln[a];
+// if (right < pt.aln[a]) right = pt.aln[a];
+// }
+// }
+// #if 0
+// if (match)
+// {
+// if (first)
+// {
+// cout << BT.toString(pm1[i][k],0) << endl;
+// first = false;
+// }
+// cout << boost::format("%.4f") % pt.score << " "
+// << setw(5) << d1 << " " << (match ? "* " : " ")
+// << toString(*BT.V2, pt.start2, pt.len2) << " ["
+// << pt.good1 << "/" << pt.joint << "/"
+// << pt.good2 << "]";
+// for (size_t a = 0; a < pt.aln.size(); a += 2)
+// cout << " " << int(pt.aln[a]) << "-" << int(pt.aln[a+1]);
+// cout << " [" << left << ":" << right << "]" << endl;
+// }
+// #endif
+// if (!match)
+// {
+// if (left == 0 && pt.len2 - right == 1)
+// d1 += pt.joint;
+// }
+// else
+// {
+// pp_all.push_back(pt);
+// // pp_all.back().m1 -= d1;
+// }
+
+// }
+// if (!first) cout << endl;
+// }
+
diff --git a/moses/TranslationModel/UG/util/Makefile b/moses/TranslationModel/UG/util/Makefile
new file mode 100644
index 000000000..afe8c7b86
--- /dev/null
+++ b/moses/TranslationModel/UG/util/Makefile
@@ -0,0 +1,7 @@
+# -*- makefile -*-
+
+MOSES_CODE=/fs/gna0/germann/code/mosesdecoder
+MOSES_ROOT=/fs/gna0/germann/moses
+LIBS = $(addprefix -l,moses icuuc icuio icui18n boost_iostreams)
+ibm1-align: ibm1-align.cc
+ g++ -o $@ -L ${MOSES_ROOT}/lib -I ${MOSES_CODE} $^ ${LIBS} -ggdb \ No newline at end of file
diff --git a/moses/TranslationModel/UG/util/ibm1-align b/moses/TranslationModel/UG/util/ibm1-align
new file mode 100755
index 000000000..2352dadb9
--- /dev/null
+++ b/moses/TranslationModel/UG/util/ibm1-align
Binary files differ
diff --git a/moses/TranslationModel/UG/util/ibm1-align.cc b/moses/TranslationModel/UG/util/ibm1-align.cc
new file mode 100644
index 000000000..08ac1f89b
--- /dev/null
+++ b/moses/TranslationModel/UG/util/ibm1-align.cc
@@ -0,0 +1,164 @@
+// -*- c++ -*-
+// Parallel text alignment via IBM1 / raw counts of word alignments
+// aiming at high precision (to seed Yawat alignments)
+// This program is tailored for use with Yawat.
+// Written by Ulrich Germann.
+
+#include <string>
+
+#include <unicode/stringpiece.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/uchar.h>
+#include <unicode/utf8.h>
+
+#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
+#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
+#include <boost/unordered_map.hpp>
+#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
+#include "moses/TranslationModel/UG/mm/ug_mm_2d_table.h"
+
+using namespace std;
+using namespace ugdiss;
+
+typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
+
+class IBM1
+{
+public:
+ table_t COOC;
+ TokenIndex V1,V2;
+
+ void
+ align(string const& s1, string const& s2, vector<int>& aln) const;
+
+ void
+ align(vector<id_type> const& x1,
+ vector<id_type> const& x2,
+ vector<int>& aln) const;
+
+ void
+ fill_amatrix(vector<id_type> const& x1,
+ vector<id_type> const& x2,
+ vector<vector<int> >& aln) const;
+
+ void
+ open(string const base, string const L1, string const L2);
+};
+
+void
+IBM1::
+open(string const base, string const L1, string const L2)
+{
+ V1.open(base+L1+".tdx");
+ V2.open(base+L2+".tdx");
+ COOC.open(base+L1+"-"+L2+".lex");
+}
+
+void
+IBM1::
+align(string const& s1, string const& s2, vector<int>& aln) const
+{
+ vector<id_type> x1,x2;
+ V1.fillIdSeq(s1,x1);
+ V2.fillIdSeq(s2,x2);
+ align(x1,x2,aln);
+}
+
+ static UnicodeString apos = UnicodeString::fromUTF8(StringPiece("'"));
+
+string
+u(StringPiece str, size_t start, size_t stop)
+{
+ string ret;
+ UnicodeString::fromUTF8(str).tempSubString(start,stop).toUTF8String(ret);
+ return ret;
+}
+
+void
+IBM1::
+fill_amatrix(vector<id_type> const& x1,
+ vector<id_type> const& x2,
+ vector<vector<int> >& aln) const
+{
+ aln.assign(x1.size(),vector<int>(x2.size()));
+ for (size_t i = 0; i < x1.size(); ++i)
+ for (size_t k = 0; k < x2.size(); ++k)
+ aln[i][k] = COOC[x1[i]][x2[k]];
+#if 0
+ cout << setw(10) << " ";
+ for (size_t k = 0; k < x2.size(); ++k)
+ cout << setw(7) << right << u(V2[x2[k]],0,6);
+ cout << endl;
+ for (size_t i = 0; i < x1.size(); ++i)
+ {
+ cout << setw(10) << u(V1[x1[i]],0,10);
+ for (size_t k = 0; k < x2.size(); ++k)
+ {
+ if (aln[i][k] > 999999)
+ cout << setw(7) << aln[i][k]/1000 << " K";
+ else
+ cout << setw(7) << aln[i][k];
+ }
+ cout << endl;
+ }
+#endif
+}
+
+
+void
+IBM1::
+align(vector<id_type> const& x1,
+ vector<id_type> const& x2,
+ vector<int>& aln) const
+{
+ vector<vector<int> > M;
+ // fill_amatrix(x1,x2,M);
+ vector<int> i1(x1.size(),0), max1(x1.size(),0);
+ vector<int> i2(x2.size(),0), max2(x2.size(),0);
+ aln.clear();
+ for (size_t i = 0; i < i1.size(); ++i)
+ {
+ for (size_t k = 0; k < i2.size(); ++k)
+ {
+ int c = COOC[x1[i]][x2[k]];
+ if (c > max1[i]) { i1[i] = k; max1[i] = c; }
+ if (c >= max2[k]) { i2[k] = i; max2[k] = c; }
+ }
+ }
+ for (size_t i = 0; i < i1.size(); ++i)
+ {
+ if (max1[i] && i2[i1[i]] == i)
+ {
+ aln.push_back(i);
+ aln.push_back(i1[i]);
+ }
+ }
+}
+
+int main(int argc, char* argv[])
+{
+ IBM1 ibm1;
+ ibm1.open(argv[1],argv[2],argv[3]);
+ string line1,line2,sid;
+ while (getline(cin,sid))
+ {
+ if (!getline(cin,line1)) assert(false);
+ if (!getline(cin,line2)) assert(false);
+ vector<int> a;
+ vector<id_type> s1,s2;
+ ibm1.V1.fillIdSeq(line1,s1);
+ ibm1.V2.fillIdSeq(line2,s2);
+ ibm1.align(s1,s2,a);
+ cout << sid;
+ for (size_t i = 0; i < a.size(); i += 2)
+ cout << " " << a[i] << ":" << a[i+1] << ":unspec";
+ cout << endl;
+ // cout << line1 << endl;
+ // cout << line2 << endl;
+ // for (size_t i = 0; i < a.size(); i += 2)
+ // cout << ibm1.V1[s1[a[i]]] << " - "
+ // << ibm1.V2[s2[a[i+1]]] << endl;
+ }
+ // cout << endl;
+}
diff --git a/moses/TranslationModel/UG/util/tokenindex.dump.cc b/moses/TranslationModel/UG/util/tokenindex.dump.cc
new file mode 100644
index 000000000..8ab68579d
--- /dev/null
+++ b/moses/TranslationModel/UG/util/tokenindex.dump.cc
@@ -0,0 +1,31 @@
+// (c) 2007,2008 Ulrich Germann
+// Licensed to NRC-CNRC under special agreement.
+
+/**
+ * @author Ulrich Germann
+ * @file tokenindex.dump.cc
+ * @brief Dumps a TokenIndex (vocab file for TPPT and TPLM) to stdout.
+ */
+
+#include "../mm/tpt_tokenindex.h"
+#include <iostream>
+#include <iomanip>
+
+using namespace std;
+using namespace ugdiss;
+int
+main(int argc,char* argv[])
+{
+ if (argc > 1 && !strcmp(argv[1], "-h")) {
+ printf("Usage: %s <file>\n\n", argv[0]);
+ cout << "Converts a phrase table in text format to a phrase table in tighly packed format." << endl;
+ cout << "input file: token index file" << endl;
+ exit(1);
+ }
+
+ TokenIndex I;
+ I.open(argv[1]);
+ vector<char const*> foo = I.reverseIndex();
+ for (size_t i = 0; i < foo.size(); i++)
+ cout << setw(10) << i << " " << foo[i] << endl;
+}
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index fc68e1f0d..26dce03d0 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -323,17 +323,10 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// do not try to find the best ... report multiple matches
if (multiple_flag) {
- int input_letter_length = compute_length( input[sentenceInd] );
for(int si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
- unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
- // do not report multiple identical sentences, but just their count
- //cout << sentenceInd << " "; // sentence number
- //cout << letter_cost << "/" << input_letter_length << " ";
- //cout << "(" << best_cost <<"/" << input_length <<") ";
- //cout << "||| " << s << " ||| " << path << endl;
-
+ sed( input[sentenceInd], source[s], path, true );
const vector<WORD_ID> &sourceSentence = source[s];
vector<SentenceAlignment> &targets = targetAndAlignment[s];
create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
@@ -345,10 +338,10 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// find the best matches according to letter sed
string best_path = "";
int best_match = -1;
- int best_letter_cost;
+ unsigned int best_letter_cost;
if (lsed_flag) {
best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
- for(int si=0; si<best_tm.size(); si++) {
+ for(size_t si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
@@ -413,11 +406,9 @@ void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector
istream *fileStreamP = &fileStream;
- char line[LINE_MAX_LENGTH];
- while(true) {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
- corpus.push_back( GetVocabulary().Tokenize( line ) );
+ string line;
+ while(getline(*fileStreamP, line)) {
+ corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
}
}
@@ -436,12 +427,9 @@ void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector<
WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
int lineNum = 0;
- char line[LINE_MAX_LENGTH];
- while(true) {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
-
- vector<WORD_ID> toks = GetVocabulary().Tokenize( line );
+ string line;
+ while(getline(*fileStreamP, line)) {
+ vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
corpus.push_back(vector< SentenceAlignment >());
vector< SentenceAlignment > &vec = corpus.back();
@@ -493,11 +481,8 @@ void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vect
string delimiter = "|||";
int lineNum = 0;
- char line[LINE_MAX_LENGTH];
- while(true) {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
-
+ string line;
+ while(getline(*fileStreamP, line)) {
vector< SentenceAlignment > &vec = corpus[lineNum];
size_t targetInd = 0;
SentenceAlignment *sentence = &vec[targetInd];
@@ -715,7 +700,7 @@ unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentenc
/* brute force method: compare input to all corpus sentences */
-int FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
+void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
vector< vector< WORD_ID > > input )
{
// go through input set...
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
index 6405ae566..da50b64b9 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
@@ -60,8 +60,8 @@ protected:
void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
/** brute force method: compare input to all corpus sentences */
- int basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
- std::vector< std::vector< tmmt::WORD_ID > > input ) ;
+ void basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
+ std::vector< std::vector< tmmt::WORD_ID > > input ) ;
/** utlility function: compute length of sentence in characters
(spaces do not count) */
diff --git a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
index 536bff741..2930147ab 100644
--- a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
+++ b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
@@ -14,17 +14,16 @@ SuffixArray::SuffixArray( string fileName )
m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
ifstream extractFile;
- char line[LINE_MAX_LENGTH];
// count the number of words first;
extractFile.open(fileName.c_str());
istream *fileP = &extractFile;
m_size = 0;
size_t sentenceCount = 0;
- while(!fileP->eof()) {
- SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
- if (fileP->eof()) break;
- vector< WORD_ID > words = m_vcb.Tokenize( line );
+ string line;
+ while(getline(*fileP, line)) {
+
+ vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
m_size += words.size() + 1;
sentenceCount++;
}
@@ -43,10 +42,8 @@ SuffixArray::SuffixArray( string fileName )
int sentenceId = 0;
extractFile.open(fileName.c_str());
fileP = &extractFile;
- while(!fileP->eof()) {
- SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
- if (fileP->eof()) break;
- vector< WORD_ID > words = m_vcb.Tokenize( line );
+ while(getline(*fileP, line)) {
+ vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
// add to corpus vector
corpus.push_back(words);
diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.h b/moses/TranslationModel/fuzzy-match/Vocabulary.h
index dfa11c1db..5a79e2f26 100644
--- a/moses/TranslationModel/fuzzy-match/Vocabulary.h
+++ b/moses/TranslationModel/fuzzy-match/Vocabulary.h
@@ -17,20 +17,6 @@
namespace tmmt
{
-
-#define MAX_LENGTH 10000
-
-#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
- _IS.getline(_LINE, _SIZE, _DELIM); \
- if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
- if (_IS.gcount() == _SIZE-1) { \
- cerr << "Line too long! Buffer overflow. Delete lines >=" \
- << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
- << endl; \
- exit(1); \
- } \
- }
-
typedef std::string WORD;
typedef unsigned int WORD_ID;
diff --git a/moses/TranslationModel/fuzzy-match/create_xml.cpp b/moses/TranslationModel/fuzzy-match/create_xml.cpp
index 44c1efc9f..a8b6a52cf 100644
--- a/moses/TranslationModel/fuzzy-match/create_xml.cpp
+++ b/moses/TranslationModel/fuzzy-match/create_xml.cpp
@@ -31,8 +31,8 @@ void create_xml(const string &inPath)
ofstream rule((inPath + ".extract").c_str());
ofstream ruleInv((inPath + ".extract.inv").c_str());
- int setenceId;
- float score;
+ // int setenceId;
+ // float score;
string source, target, align, path;
string *input = NULL;
int count;
@@ -47,11 +47,11 @@ void create_xml(const string &inPath)
//cout << inLine << endl;
switch (step) {
case 0:
- setenceId = Scan<int>(inLine);
+ /*setenceId = */ Scan<int>(inLine);
++step;
break;
case 1:
- score = Scan<float>(inLine);
+ /*score = */ Scan<float>(inLine);
++step;
break;
case 2:
@@ -124,7 +124,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
int start_s = 0, start_i = 0;
//cerr << input << endl << source << endl << target << endl << path << endl;
- for ( int p = 0 ; p < path.length() ; p++ ) {
+ for ( int p = 0 ; p < int(path.length()) ; p++ ) {
string action = path.substr(p, 1);
// beginning of a mismatch
@@ -176,7 +176,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
}
// end of sentence? add to end
- if ( start_t == 1000 && i > inputToks.size() - 1 ) {
+ if ( start_t == 1000 && i > int(inputToks.size()) - 1 ) {
start_t = targetsToks.size() - 1;
}
@@ -216,13 +216,13 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
if ( action != "I" ) {
//cerr << " ->";
- if (s < alignments.m_alignS2T.size()) {
+ if (s < int(alignments.m_alignS2T.size())) {
const std::map<int, int> &targets = alignments.m_alignS2T[s];
//cerr << "s=" << s << endl;
std::map<int, int>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
- int tt = iter->first;
+ // int tt = iter->first;
//cerr << " " << tt;
}
}
@@ -245,7 +245,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
} // for ( int p = 0
//cerr << target << endl;
- for (int i = 0; i < targetBitmap.size(); ++i) {
+ for (size_t i = 0; i < targetBitmap.size(); ++i) {
//cerr << targetBitmap[i];
}
//cerr << endl;
@@ -260,13 +260,13 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
int rule_pos_s = 0;
map<int, int> ruleAlignS;
- for (int i = 0 ; i < inputBitmap.size() ; ++i ) {
+ for (int i = 0 ; i < int(inputBitmap.size()) ; ++i ) {
if ( inputBitmap[i] ) {
ret.ruleS += inputToks[i] + " ";
ruleAlignS[ alignI2S[i] ] = rule_pos_s++;
}
- for (int j = 0; j < nonTerms.size(); ++j) {
+ for (size_t j = 0; j < nonTerms.size(); ++j) {
map<string, int> &nt = nonTerms[j];
if (i == nt["start_i"]) {
ret.ruleS += "[X][X] ";
@@ -284,7 +284,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
ruleAlignT[t] = rule_pos_t++;
}
- for (int i = 0; i < nonTerms.size(); ++i) {
+ for (size_t i = 0; i < nonTerms.size(); ++i) {
map<string, int> &nt = nonTerms[i];
if (t == nt["start_t"]) {
@@ -300,7 +300,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
for (map<int, int>::const_iterator iter = ruleAlignS.begin(); iter != ruleAlignS.end(); ++iter) {
int s = iter->first;
- if (s < alignments.m_alignS2T.size()) {
+ if (s < int(alignments.m_alignS2T.size())) {
const std::map<int, int> &targets = alignments.m_alignS2T[s];
std::map<int, int>::const_iterator iter;
@@ -316,7 +316,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
//cerr << "numAlign=" << numAlign << endl;
- for (int i = 0; i < nonTerms.size(); ++i) {
+ for (size_t i = 0; i < nonTerms.size(); ++i) {
map<string, int> &nt = nonTerms[i];
ret.ruleAlignment += SPrint(nt["rule_pos_s"]) + "-" + SPrint(nt["rule_pos_t"]) + " ";
++numAlign;
@@ -329,7 +329,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
ret.ruleAlignment = TrimInternal(ret.ruleAlignment);
vector<string> ruleAlignmentToks = Tokenize(ret.ruleAlignment);
- for (int i = 0; i < ruleAlignmentToks.size(); ++i) {
+ for (size_t i = 0; i < ruleAlignmentToks.size(); ++i) {
const string &alignPoint = ruleAlignmentToks[i];
vector<string> toks = Tokenize(alignPoint, "-");
assert(toks.size() == 2);
@@ -338,7 +338,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
ret.ruleAlignmentInv = TrimInternal(ret.ruleAlignmentInv);
// frame
- ret.frame;
+ // ret.frame;
if (frameInput.find(-1) == frameInput.end())
ret.frame = frameInput[-1];
@@ -346,7 +346,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
int start_t = -1;
targetBitmap.push_back(0);
- for (int t = 0 ; t <= targetsToks.size() ; t++ ) {
+ for (size_t t = 0 ; t <= targetsToks.size() ; t++ ) {
// beginning of tm target inclusion
if ( !currently_included && targetBitmap[t] ) {
start_t = t;
@@ -360,7 +360,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
if ( start_t >= 0 ) {
string target = "";
//cerr << "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
- for (int tt = start_t ; tt < t + targetBitmap[t] ; tt++ ) {
+ for (size_t tt = start_t ; tt < t + targetBitmap[t] ; tt++ ) {
target += targetsToks[tt] + " ";
}
// target = Trim(target); TODO
diff --git a/moses/TranslationOption.cpp b/moses/TranslationOption.cpp
index d6ce2ff9a..ecd9e7f0e 100644
--- a/moses/TranslationOption.cpp
+++ b/moses/TranslationOption.cpp
@@ -32,7 +32,7 @@ namespace Moses
{
TranslationOption::TranslationOption()
- :m_targetPhrase()
+ :m_targetPhrase(NULL)
,m_inputPath(NULL)
,m_sourceWordsRange(NOT_FOUND, NOT_FOUND)
{
@@ -71,10 +71,10 @@ void TranslationOption::CacheLexReorderingScores(const LexicalReordering &produc
m_lexReorderingScores[&producer] = score;
}
-void TranslationOption::Evaluate(const InputType &input)
+void TranslationOption::EvaluateWithSourceContext(const InputType &input)
{
const InputPath &inputPath = GetInputPath();
- m_targetPhrase.Evaluate(input, inputPath);
+ m_targetPhrase.EvaluateWithSourceContext(input, inputPath);
}
const InputPath &TranslationOption::GetInputPath() const
diff --git a/moses/TranslationOption.h b/moses/TranslationOption.h
index 3bc1797bb..9d2e10780 100644
--- a/moses/TranslationOption.h
+++ b/moses/TranslationOption.h
@@ -135,7 +135,7 @@ public:
return m_targetPhrase.GetScoreBreakdown();
}
- void Evaluate(const InputType &input);
+ void EvaluateWithSourceContext(const InputType &input);
/** returns cached scores */
inline const Scores *GetLexReorderingScores(const LexicalReordering *scoreProducer) const {
diff --git a/moses/TranslationOptionCollection.cpp b/moses/TranslationOptionCollection.cpp
index 73aec47e9..19690ed40 100644
--- a/moses/TranslationOptionCollection.cpp
+++ b/moses/TranslationOptionCollection.cpp
@@ -212,6 +212,12 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
float unknownScore = FloorScore(TransformScore(0));
const Word &sourceWord = inputPath.GetPhrase().GetWord(0);
+ // hack. Once the OOV FF is a phrase table, get rid of this
+ PhraseDictionary *firstPt = NULL;
+ if (PhraseDictionary::GetColl().size() == 0) {
+ firstPt = PhraseDictionary::GetColl()[0];
+ }
+
// unknown word, add as trans opt
FactorCollection &factorCollection = FactorCollection::Instance();
@@ -231,7 +237,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
// modify the starting bitmap
}
- TargetPhrase targetPhrase;
+ TargetPhrase targetPhrase(firstPt);
if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) {
// add to dictionary
@@ -266,7 +272,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
m_unksrcs.push_back(&sourcePhrase);
WordsRange range(sourcePos, sourcePos + length - 1);
- targetPhrase.Evaluate(sourcePhrase);
+ targetPhrase.EvaluateInIsolation(sourcePhrase);
TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
transOpt->SetInputPath(inputPath);
@@ -416,7 +422,7 @@ void TranslationOptionCollection::CreateTranslationOptions()
ProcessUnknownWord();
- EvaluateWithSource();
+ EvaluateWithSourceContext();
// Prune
Prune();
@@ -546,7 +552,7 @@ void TranslationOptionCollection::SetInputScore(const InputPath &inputPath, Part
}
}
-void TranslationOptionCollection::EvaluateWithSource()
+void TranslationOptionCollection::EvaluateWithSourceContext()
{
const size_t size = m_source.GetSize();
for (size_t startPos = 0 ; startPos < size ; ++startPos) {
@@ -560,7 +566,7 @@ void TranslationOptionCollection::EvaluateWithSource()
TranslationOptionList::const_iterator iterTransOpt;
for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
TranslationOption &transOpt = **iterTransOpt;
- transOpt.Evaluate(m_source);
+ transOpt.EvaluateWithSourceContext(m_source);
}
}
}
diff --git a/moses/TranslationOptionCollection.h b/moses/TranslationOptionCollection.h
index a311cee03..90cba2185 100644
--- a/moses/TranslationOptionCollection.h
+++ b/moses/TranslationOptionCollection.h
@@ -96,7 +96,7 @@ protected:
//! implemented by inherited class, called by this class
virtual void ProcessUnknownWord(size_t sourcePos)=0;
- void EvaluateWithSource();
+ void EvaluateWithSourceContext();
void CacheLexReordering();
diff --git a/moses/TranslationOptionCollectionLattice.cpp b/moses/TranslationOptionCollectionLattice.cpp
index a7838feaf..349aa385c 100644
--- a/moses/TranslationOptionCollectionLattice.cpp
+++ b/moses/TranslationOptionCollectionLattice.cpp
@@ -24,7 +24,7 @@ TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
UTIL_THROW_IF2(StaticData::Instance().GetUseLegacyPT(),
- "Not for models using the legqacy binary phrase table");
+ "Not for models using the legqacy binary phrase table");
const InputFeature &inputFeature = InputFeature::Instance();
UTIL_THROW_IF2(&inputFeature == NULL, "Input feature must be specified");
@@ -48,7 +48,7 @@ TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
WordsRange range(startPos, endPos);
if (range.GetNumWordsCovered() > maxPhraseLength) {
- continue;
+ continue;
}
const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);
@@ -73,53 +73,53 @@ TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
void TranslationOptionCollectionLattice::Extend(const InputPath &prevPath, const WordLattice &input)
{
- size_t nextPos = prevPath.GetWordsRange().GetEndPos() + 1;
- if (nextPos >= input.GetSize()) {
- return;
- }
+ size_t nextPos = prevPath.GetWordsRange().GetEndPos() + 1;
+ if (nextPos >= input.GetSize()) {
+ return;
+ }
- size_t startPos = prevPath.GetWordsRange().GetStartPos();
- const Phrase &prevPhrase = prevPath.GetPhrase();
- const ScorePair *prevInputScore = prevPath.GetInputScore();
- UTIL_THROW_IF2(prevInputScore == NULL,
- "Null previous score");
+ size_t startPos = prevPath.GetWordsRange().GetStartPos();
+ const Phrase &prevPhrase = prevPath.GetPhrase();
+ const ScorePair *prevInputScore = prevPath.GetInputScore();
+ UTIL_THROW_IF2(prevInputScore == NULL,
+ "Null previous score");
- const std::vector<size_t> &nextNodes = input.GetNextNodes(nextPos);
+ const std::vector<size_t> &nextNodes = input.GetNextNodes(nextPos);
- const ConfusionNet::Column &col = input.GetColumn(nextPos);
- for (size_t i = 0; i < col.size(); ++i) {
- const Word &word = col[i].first;
- UTIL_THROW_IF2(word.IsEpsilon(), "Epsilon not supported");
+ const ConfusionNet::Column &col = input.GetColumn(nextPos);
+ for (size_t i = 0; i < col.size(); ++i) {
+ const Word &word = col[i].first;
+ UTIL_THROW_IF2(word.IsEpsilon(), "Epsilon not supported");
- size_t nextNode = nextNodes[i];
- size_t endPos = nextPos + nextNode - 1;
+ size_t nextNode = nextNodes[i];
+ size_t endPos = nextPos + nextNode - 1;
- WordsRange range(startPos, endPos);
+ WordsRange range(startPos, endPos);
- size_t maxPhraseLength = StaticData::Instance().GetMaxPhraseLength();
- if (range.GetNumWordsCovered() > maxPhraseLength) {
- continue;
- }
+ size_t maxPhraseLength = StaticData::Instance().GetMaxPhraseLength();
+ if (range.GetNumWordsCovered() > maxPhraseLength) {
+ continue;
+ }
- const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);
+ const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);
- Phrase subphrase(prevPhrase);
- subphrase.AddWord(word);
+ Phrase subphrase(prevPhrase);
+ subphrase.AddWord(word);
- const ScorePair &scores = col[i].second;
- ScorePair *inputScore = new ScorePair(*prevInputScore);
- inputScore->PlusEquals(scores);
+ const ScorePair &scores = col[i].second;
+ ScorePair *inputScore = new ScorePair(*prevInputScore);
+ inputScore->PlusEquals(scores);
- InputPath *path = new InputPath(subphrase, labels, range, &prevPath, inputScore);
+ InputPath *path = new InputPath(subphrase, labels, range, &prevPath, inputScore);
- path->SetNextNode(nextNode);
- m_inputPathQueue.push_back(path);
+ path->SetNextNode(nextNode);
+ m_inputPathQueue.push_back(path);
- // recursive
- Extend(*path, input);
+ // recursive
+ Extend(*path, input);
- }
+ }
}
void TranslationOptionCollectionLattice::CreateTranslationOptions()
@@ -142,21 +142,19 @@ void TranslationOptionCollectionLattice::CreateTranslationOptions()
const WordsRange &range = path.GetWordsRange();
if (tpColl && tpColl->GetSize()) {
- TargetPhraseCollection::const_iterator iter;
- for (iter = tpColl->begin(); iter != tpColl->end(); ++iter) {
- const TargetPhrase &tp = **iter;
- TranslationOption *transOpt = new TranslationOption(range, tp);
- transOpt->SetInputPath(path);
- transOpt->Evaluate(m_source);
-
- Add(transOpt);
- }
- } else if (path.GetPhrase().GetSize() == 1) {
- // unknown word processing
- ProcessOneUnknownWord(path, path.GetWordsRange().GetEndPos(), 1, path.GetInputScore());
- } else if (path.GetPhrase().GetSize() == 1) {
- // unknown word processing
- ProcessOneUnknownWord(path, path.GetWordsRange().GetStartPos(), path.GetWordsRange().GetNumWordsCovered() , path.GetInputScore());
+ TargetPhraseCollection::const_iterator iter;
+ for (iter = tpColl->begin(); iter != tpColl->end(); ++iter) {
+ const TargetPhrase &tp = **iter;
+ TranslationOption *transOpt = new TranslationOption(range, tp);
+ transOpt->SetInputPath(path);
+ transOpt->EvaluateWithSourceContext(m_source);
+
+ Add(transOpt);
+ }
+ }
+ else if (path.GetPhrase().GetSize() == 1) {
+ // unknown word processing
+ ProcessOneUnknownWord(path, path.GetWordsRange().GetStartPos(), path.GetWordsRange().GetNumWordsCovered() , path.GetInputScore());
}
}
@@ -175,16 +173,16 @@ void TranslationOptionCollectionLattice::CreateTranslationOptions()
void TranslationOptionCollectionLattice::ProcessUnknownWord(size_t sourcePos)
{
- UTIL_THROW(util::Exception, "ProcessUnknownWord() not implemented for lattice");
+ UTIL_THROW(util::Exception, "ProcessUnknownWord() not implemented for lattice");
}
void TranslationOptionCollectionLattice::CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
- , size_t startPosition
- , size_t endPosition
- , bool adhereTableLimit
- , size_t graphInd)
+ , size_t startPosition
+ , size_t endPosition
+ , bool adhereTableLimit
+ , size_t graphInd)
{
- UTIL_THROW(util::Exception, "CreateTranslationOptionsForRange() not implemented for lattice");
+ UTIL_THROW(util::Exception, "CreateTranslationOptionsForRange() not implemented for lattice");
}
} // namespace
diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp
new file mode 100644
index 000000000..d8b448d44
--- /dev/null
+++ b/moses/TranslationTask.cpp
@@ -0,0 +1,318 @@
+#include "TranslationTask.h"
+#include "moses/StaticData.h"
+#include "moses/Sentence.h"
+#include "moses/IOWrapper.h"
+#include "moses/TranslationAnalysis.h"
+#include "moses/TypeDef.h"
+#include "moses/Util.h"
+#include "moses/InputType.h"
+#include "moses/OutputCollector.h"
+#include "moses/Incremental.h"
+#include "mbr.h"
+
+#include "moses/Syntax/S2T/Parsers/RecursiveCYKPlusParser/RecursiveCYKPlusParser.h"
+#include "moses/Syntax/S2T/Parsers/Scope3Parser/Parser.h"
+
+#include "util/exception.hh"
+
+using namespace std;
+
+namespace Moses
+{
+
+TranslationTask::TranslationTask(InputType* source, Moses::IOWrapper &ioWrapper, int pbOrChart)
+: m_source(source)
+, m_ioWrapper(ioWrapper)
+, m_pbOrChart(pbOrChart)
+{}
+
+TranslationTask::~TranslationTask() {
+ delete m_source;
+}
+
+void TranslationTask::Run()
+{
+ switch (m_pbOrChart)
+ {
+ case 1:
+ RunPb();
+ break;
+ case 2:
+ RunChart();
+ break;
+ default:
+ UTIL_THROW(util::Exception, "Unknown value: " << m_pbOrChart);
+ }
+}
+
+
+void TranslationTask::RunPb()
+{
+ // shorthand for "global data"
+ const StaticData &staticData = StaticData::Instance();
+
+ // input sentence
+ Sentence sentence;
+
+ // report wall time spent on translation
+ Timer translationTime;
+ translationTime.start();
+
+ // report thread number
+#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
+ TRACE_ERR("Translating line " << m_source->GetTranslationId() << " in thread id " << pthread_self() << endl);
+#endif
+
+
+ // execute the translation
+ // note: this executes the search, resulting in a search graph
+ // we still need to apply the decision rule (MAP, MBR, ...)
+ Timer initTime;
+ initTime.start();
+ Manager manager(*m_source,staticData.GetSearchAlgorithm());
+ VERBOSE(1, "Line " << m_source->GetTranslationId() << ": Initialize search took " << initTime << " seconds total" << endl);
+ manager.Decode();
+
+ // we are done with search, let's look what we got
+ Timer additionalReportingTime;
+ additionalReportingTime.start();
+
+ // output word graph
+ manager.OutputWordGraph(m_ioWrapper.GetWordGraphCollector());
+
+ // output search graph
+ manager.OutputSearchGraph(m_ioWrapper.GetSearchGraphOutputCollector());
+
+ manager.OutputSearchGraphSLF();
+
+ // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
+ manager.OutputSearchGraphHypergraph();
+
+ additionalReportingTime.stop();
+
+ // apply decision rule and output best translation(s)
+ if (m_ioWrapper.GetSingleBestOutputCollector()) {
+ ostringstream out;
+ ostringstream debug;
+ FixPrecision(debug,PRECISION);
+
+ // all derivations - send them to debug stream
+ if (staticData.PrintAllDerivations()) {
+ additionalReportingTime.start();
+ manager.PrintAllDerivations(m_source->GetTranslationId(), debug);
+ additionalReportingTime.stop();
+ }
+
+ Timer decisionRuleTime;
+ decisionRuleTime.start();
+
+ // MAP decoding: best hypothesis
+ const Hypothesis* bestHypo = NULL;
+ if (!staticData.UseMBR()) {
+ bestHypo = manager.GetBestHypothesis();
+ if (bestHypo) {
+ if (StaticData::Instance().GetOutputHypoScore()) {
+ out << bestHypo->GetTotalScore() << ' ';
+ }
+ if (staticData.IsPathRecoveryEnabled()) {
+ m_ioWrapper.OutputInput(out, bestHypo);
+ out << "||| ";
+ }
+
+ const PARAM_VEC *params = staticData.GetParameter().GetParam("print-id");
+ if (params && params->size() && Scan<bool>(params->at(0)) ) {
+ out << m_source->GetTranslationId() << " ";
+ }
+
+ if (staticData.GetReportSegmentation() == 2) {
+ manager.GetOutputLanguageModelOrder(out, bestHypo);
+ }
+ m_ioWrapper.OutputBestSurface(
+ out,
+ bestHypo,
+ staticData.GetOutputFactorOrder(),
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors());
+ if (staticData.PrintAlignmentInfo()) {
+ out << "||| ";
+ m_ioWrapper.OutputAlignment(out, bestHypo);
+ }
+
+ manager.OutputAlignment(m_ioWrapper.GetAlignmentInfoCollector());
+
+ IFVERBOSE(1) {
+ debug << "BEST TRANSLATION: " << *bestHypo << endl;
+ }
+ } else {
+ VERBOSE(1, "NO BEST TRANSLATION" << endl);
+ }
+
+ out << endl;
+ }
+
+ // MBR decoding (n-best MBR, lattice MBR, consensus)
+ else {
+ // we first need the n-best translations
+ size_t nBestSize = staticData.GetMBRSize();
+ if (nBestSize <= 0) {
+ cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl;
+ exit(1);
+ }
+ TrellisPathList nBestList;
+ manager.CalcNBest(nBestSize, nBestList,true);
+ VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
+ IFVERBOSE(2) {
+ PrintUserTime("calculated n-best list for (L)MBR decoding");
+ }
+
+ // lattice MBR
+ if (staticData.UseLatticeMBR()) {
+ if (m_ioWrapper.GetNBestOutputCollector()) {
+ //lattice mbr nbest
+ vector<LatticeMBRSolution> solutions;
+ size_t n = min(nBestSize, staticData.GetNBestSize());
+ getLatticeMBRNBest(manager,nBestList,solutions,n);
+ ostringstream out;
+ m_ioWrapper.OutputLatticeMBRNBest(out, solutions,m_source->GetTranslationId());
+ m_ioWrapper.GetNBestOutputCollector()->Write(m_source->GetTranslationId(), out.str());
+ } else {
+ //Lattice MBR decoding
+ vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
+ m_ioWrapper.OutputBestHypo(mbrBestHypo, m_source->GetTranslationId(), staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),out);
+ IFVERBOSE(2) {
+ PrintUserTime("finished Lattice MBR decoding");
+ }
+ }
+ }
+
+ // consensus decoding
+ else if (staticData.UseConsensusDecoding()) {
+ const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
+ m_ioWrapper.OutputBestHypo(conBestHypo, m_source->GetTranslationId(),
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),out);
+ m_ioWrapper.OutputAlignment(m_ioWrapper.GetAlignmentInfoCollector(), m_source->GetTranslationId(), conBestHypo);
+ IFVERBOSE(2) {
+ PrintUserTime("finished Consensus decoding");
+ }
+ }
+
+ // n-best MBR decoding
+ else {
+ const TrellisPath &mbrBestHypo = doMBR(nBestList);
+ m_ioWrapper.OutputBestHypo(mbrBestHypo, m_source->GetTranslationId(),
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),out);
+ m_ioWrapper.OutputAlignment(m_ioWrapper.GetAlignmentInfoCollector(), m_source->GetTranslationId(), mbrBestHypo);
+ IFVERBOSE(2) {
+ PrintUserTime("finished MBR decoding");
+ }
+ }
+ }
+
+ // report best translation to output collector
+ m_ioWrapper.GetSingleBestOutputCollector()->Write(m_source->GetTranslationId(),out.str(),debug.str());
+
+ decisionRuleTime.stop();
+ VERBOSE(1, "Line " << m_source->GetTranslationId() << ": Decision rule took " << decisionRuleTime << " seconds total" << endl);
+ }
+
+ additionalReportingTime.start();
+
+ // output n-best list
+ manager.OutputNBest(m_ioWrapper.GetNBestOutputCollector());
+
+ //lattice samples
+ manager.OutputLatticeSamples(m_ioWrapper.GetLatticeSamplesCollector());
+
+ // detailed translation reporting
+ manager.OutputDetailedTranslationReport(m_ioWrapper.GetDetailedTranslationCollector());
+
+ //list of unknown words
+ manager.OutputUnknowns(m_ioWrapper.GetUnknownsCollector());
+
+ // report additional statistics
+ manager.CalcDecoderStatistics();
+ VERBOSE(1, "Line " << m_source->GetTranslationId() << ": Additional reporting took " << additionalReportingTime << " seconds total" << endl);
+ VERBOSE(1, "Line " << m_source->GetTranslationId() << ": Translation took " << translationTime << " seconds total" << endl);
+ IFVERBOSE(2) {
+ PrintUserTime("Sentence Decoding Time:");
+ }
+}
+
+
+void TranslationTask::RunChart()
+{
+ const StaticData &staticData = StaticData::Instance();
+ const size_t translationId = m_source->GetTranslationId();
+
+ VERBOSE(2,"\nTRANSLATING(" << translationId << "): " << *m_source);
+
+ if (staticData.UseS2TDecoder()) {
+ S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
+ if (algorithm == RecursiveCYKPlus) {
+ typedef Syntax::S2T::EagerParserCallback Callback;
+ typedef Syntax::S2T::RecursiveCYKPlusParser<Callback> Parser;
+ DecodeS2T<Parser>();
+ } else if (algorithm == Scope3) {
+ typedef Syntax::S2T::StandardParserCallback Callback;
+ typedef Syntax::S2T::Scope3Parser<Callback> Parser;
+ DecodeS2T<Parser>();
+ } else {
+ UTIL_THROW2("ERROR: unhandled S2T parsing algorithm");
+ }
+ return;
+ }
+
+ if (staticData.GetSearchAlgorithm() == ChartIncremental) {
+ Incremental::Manager manager(*m_source);
+ manager.Decode();
+ const std::vector<search::Applied> &nbest = manager.GetNBest();
+ if (!nbest.empty()) {
+ m_ioWrapper.OutputBestHypo(nbest[0], translationId);
+
+ manager.OutputDetailedTranslationReport(m_ioWrapper.GetDetailedTranslationCollector());
+ manager.OutputDetailedTreeFragmentsTranslationReport(m_ioWrapper.GetDetailTreeFragmentsOutputCollector());
+
+ } else {
+ m_ioWrapper.OutputBestNone(translationId);
+ }
+
+ manager.OutputNBest(m_ioWrapper.GetNBestOutputCollector());
+
+ return;
+ }
+
+ ChartManager manager(*m_source);
+ manager.Decode();
+
+ UTIL_THROW_IF2(staticData.UseMBR(), "Cannot use MBR");
+
+ // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
+ manager.OutputSearchGraphHypergraph();
+
+ // 1-best
+ const ChartHypothesis *bestHypo = manager.GetBestHypothesis();
+ m_ioWrapper.OutputBestHypo(bestHypo, translationId);
+ IFVERBOSE(2) {
+ PrintUserTime("Best Hypothesis Generation Time:");
+ }
+
+ manager.OutputAlignment(m_ioWrapper.GetAlignmentInfoCollector());
+ manager.OutputDetailedTranslationReport(m_ioWrapper.GetDetailedTranslationCollector());
+ manager.OutputDetailedTreeFragmentsTranslationReport(m_ioWrapper.GetDetailTreeFragmentsOutputCollector());
+ manager.OutputUnknowns(m_ioWrapper.GetUnknownsCollector());
+
+ // n-best
+ manager.OutputNBest(m_ioWrapper.GetNBestOutputCollector());
+
+ manager.OutputSearchGraph(m_ioWrapper.GetSearchGraphOutputCollector());
+
+ IFVERBOSE(2) {
+ PrintUserTime("Sentence Decoding Time:");
+ }
+ manager.CalcDecoderStatistics();
+}
+
+}
diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h
new file mode 100644
index 000000000..217ffce00
--- /dev/null
+++ b/moses/TranslationTask.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <boost/smart_ptr/shared_ptr.hpp>
+#include "moses/ThreadPool.h"
+#include "moses/Manager.h"
+#include "moses/HypergraphOutput.h"
+#include "moses/IOWrapper.h"
+#include "moses/Manager.h"
+#include "moses/ChartManager.h"
+
+#include "moses/Syntax/S2T/Manager.h"
+
+namespace Moses
+{
+class InputType;
+class OutputCollector;
+
+
+/** Translates a sentence.
+ * - calls the search (Manager)
+ * - applies the decision rule
+ * - outputs best translation and additional reporting
+ **/
+class TranslationTask : public Moses::Task
+{
+
+public:
+
+ TranslationTask(Moses::InputType* source, Moses::IOWrapper &ioWrapper, int pbOrChart);
+
+ ~TranslationTask();
+
+ /** Translate one sentence
+ * gets called by main function implemented at end of this source file */
+ void Run();
+
+
+private:
+ int m_pbOrChart; // 1=pb. 2=chart
+ Moses::InputType* m_source;
+ Moses::IOWrapper &m_ioWrapper;
+
+ void RunPb();
+ void RunChart();
+
+
+ template<typename Parser>
+ void DecodeS2T() {
+ const StaticData &staticData = StaticData::Instance();
+ const std::size_t translationId = m_source->GetTranslationId();
+ Syntax::S2T::Manager<Parser> manager(*m_source);
+ manager.Decode();
+ // 1-best
+ const Syntax::SHyperedge *best = manager.GetBestSHyperedge();
+ m_ioWrapper.OutputBestHypo(best, translationId);
+ // n-best
+ manager.OutputNBest(m_ioWrapper.GetNBestOutputCollector());
+
+ // Write 1-best derivation (-translation-details / -T option).
+
+ manager.OutputDetailedTranslationReport(m_ioWrapper.GetDetailedTranslationCollector());
+
+ manager.OutputUnknowns(m_ioWrapper.GetUnknownsCollector());
+ }
+
+};
+
+
+} //namespace
diff --git a/moses/TreeInput.cpp b/moses/TreeInput.cpp
index f7d3a7443..792522540 100644
--- a/moses/TreeInput.cpp
+++ b/moses/TreeInput.cpp
@@ -5,6 +5,7 @@
#include "Util.h"
#include "XmlOption.h"
#include "FactorCollection.h"
+#include "moses/TranslationModel/PhraseDictionary.h"
using namespace std;
@@ -30,6 +31,12 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
return true;
}
+ // hack. What pt should XML trans opt be assigned to?
+ PhraseDictionary *firstPt = NULL;
+ if (PhraseDictionary::GetColl().size() == 0) {
+ firstPt = PhraseDictionary::GetColl()[0];
+ }
+
// break up input into a vector of xml tags and text
// example: (this), (<b>), (is a), (</b>), (test .)
vector<string> xmlTokens = TokenizeXml(line);
@@ -145,8 +152,12 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
- if (startPos >= endPos) {
- TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
+ if (startPos == endPos) {
+ TRACE_ERR("WARNING: tag " << tagName << " span is empty. Ignoring: " << line << endl);
+ continue;
+ }
+ else if (startPos > endPos) {
+ TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl);
return false;
}
@@ -169,7 +180,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
//TRACE_ERR("number of translations: " << altTexts.size() << endl);
for (size_t i=0; i<altTexts.size(); ++i) {
// set target phrase
- TargetPhrase targetPhrase;
+ TargetPhrase targetPhrase(firstPt);
// targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);
@@ -199,7 +210,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
// convert from prob to log-prob
float scoreValue = FloorScore(TransformScore(probValue));
targetPhrase.SetXMLScore(scoreValue);
- targetPhrase.Evaluate(sourcePhrase);
+ targetPhrase.EvaluateInIsolation(sourcePhrase);
// set span and create XmlOption
WordsRange range(startPos+1,endPos);
@@ -266,7 +277,10 @@ int TreeInput::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
// default label
for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
for (size_t endPos = startPos; endPos < sourceSize; ++endPos) {
- AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder);
+ NonTerminalSet &list = GetLabelSet(startPos, endPos);
+ if (list.size() == 0 || !staticData.GetDefaultNonTermOnlyForEmptyRange()) {
+ AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder);
+ }
}
}
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index ef1f52efb..7c9bc1b8c 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -31,6 +31,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <BaseTsd.h>
#else
#include <stdint.h>
+
typedef uint32_t UINT32;
typedef uint64_t UINT64;
#endif
@@ -59,7 +60,12 @@ const size_t DEFAULT_MAX_HYPOSTACK_SIZE = 200;
const size_t DEFAULT_MAX_TRANS_OPT_CACHE_SIZE = 10000;
const size_t DEFAULT_MAX_TRANS_OPT_SIZE = 5000;
const size_t DEFAULT_MAX_PART_TRANS_OPT_SIZE = 10000;
+//#ifdef PT_UG
+// setting to std::numeric_limits<size_t>::max() makes the regression test for (deprecated) PhraseDictionaryDynamicSuffixArray fail.
+// const size_t DEFAULT_MAX_PHRASE_LENGTH = 100000;
+//#else
const size_t DEFAULT_MAX_PHRASE_LENGTH = 20;
+//#endif
const size_t DEFAULT_MAX_CHART_SPAN = 10;
const size_t ARRAY_SIZE_INCR = 10; //amount by which a phrase gets resized when necessary
const float LOWEST_SCORE = -100.0f;
@@ -68,6 +74,9 @@ const float DEFAULT_EARLY_DISCARDING_THRESHOLD = 0.0f;
const float DEFAULT_TRANSLATION_OPTION_THRESHOLD = 0.0f;
const size_t DEFAULT_VERBOSE_LEVEL = 1;
+// output floats with five significant digits
+static const size_t PRECISION = 3;
+
// enums.
// must be 0, 1, 2, ..., unless otherwise stated
@@ -106,26 +115,6 @@ enum DistortionOrientationOptions {
};
}
-enum PhraseTableImplementation {
- Memory = 0
- ,Binary = 1
- ,OnDisk = 2
- //,GlueRule = 3
- //,Joshua = 4
- //,MemorySourceLabel = 5
- ,SCFG = 6
- //,BerkeleyDb = 7
- ,SuffixArray = 8
- ,Hiero = 9
- ,ALSuffixArray = 10
- ,FuzzyMatch = 11
- ,Compact = 12
- ,Interpolated = 13
- ,DSuffixArray = 14
- ,MemMappedSA = 15
- ,DCacheBased = 32
-};
-
enum InputTypeEnum {
SentenceInput = 0
,ConfusionNetworkInput = 1
@@ -173,6 +162,11 @@ enum FormatType {
,HieroFormat
};
+enum S2TParsingAlgorithm {
+ RecursiveCYKPlus,
+ Scope3
+};
+
// typedef
typedef size_t FactorType;
diff --git a/moses/Util.cpp b/moses/Util.cpp
index 79690668f..34d03cab8 100644
--- a/moses/Util.cpp
+++ b/moses/Util.cpp
@@ -37,6 +37,9 @@
#include "Timer.h"
#include "util/exception.hh"
#include "util/file.hh"
+#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/StaticData.h"
using namespace std;
@@ -337,6 +340,44 @@ std::string PassthroughSGML(std::string &line, const std::string tagName, const
return meta;
}
+void PrintFeatureWeight(const FeatureFunction* ff)
+{
+ cout << ff->GetScoreProducerDescription() << "=";
+ size_t numScoreComps = ff->GetNumScoreComponents();
+ vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+ for (size_t i = 0; i < numScoreComps; ++i) {
+ cout << " " << values[i];
+ }
+ cout << endl;
+
}
+void ShowWeights()
+{
+ FixPrecision(cout,6);
+ const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+
+ for (size_t i = 0; i < sff.size(); ++i) {
+ const StatefulFeatureFunction *ff = sff[i];
+ if (ff->IsTuneable()) {
+ PrintFeatureWeight(ff);
+ }
+ else {
+ cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
+ }
+ }
+ for (size_t i = 0; i < slf.size(); ++i) {
+ const StatelessFeatureFunction *ff = slf[i];
+ if (ff->IsTuneable()) {
+ PrintFeatureWeight(ff);
+ }
+ else {
+ cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
+ }
+ }
+}
+
+} // namespace
+
diff --git a/moses/Util.h b/moses/Util.h
index 9bdcf622f..11be058ae 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -56,8 +56,15 @@ namespace Moses
/** verbose macros
* */
-#define VERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR(str); } }
+
+#define VERBOSE(level,str) { IFVERBOSE(level) { TRACE_ERR(str); } }
#define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level)
+#define XVERBOSE(level,str) VERBOSE(level, "[" << HERE << "] " << str)
+#define HERE __FILE__ << ":" << __LINE__
+#define FEATUREVERBOSE(level,str) FEATUREVERBOSE2(level, "[" << GetScoreProducerDescription() << "] " << str)
+#define FEATUREVERBOSE2(level,str) { IFFEATUREVERBOSE(level) { TRACE_ERR(str); } }
+#define IFFEATUREVERBOSE(level) if ((m_verbosity == std::numeric_limits<std::size_t>::max() && StaticData::Instance().GetVerboseLevel() >= level) || (m_verbosity != std::numeric_limits<std::size_t>::max() && m_verbosity >= level))
+
#if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2)
// gcc nth_element() bug
@@ -98,6 +105,52 @@ inline std::string Scan<std::string>(const std::string &input)
return input;
}
+template<>
+inline WordAlignmentSort Scan<WordAlignmentSort>(const std::string &input)
+{
+ return (WordAlignmentSort) Scan<size_t>(input);
+}
+
+template<>
+inline InputTypeEnum Scan<InputTypeEnum>(const std::string &input)
+{
+ return (InputTypeEnum) Scan<size_t>(input);
+}
+
+template<>
+inline SearchAlgorithm Scan<SearchAlgorithm>(const std::string &input)
+{
+ return (SearchAlgorithm) Scan<size_t>(input);
+}
+
+template<>
+inline S2TParsingAlgorithm Scan<S2TParsingAlgorithm>(const std::string &input)
+{
+ return (S2TParsingAlgorithm) Scan<size_t>(input);
+}
+
+template<>
+inline SourceLabelOverlap Scan<SourceLabelOverlap>(const std::string &input)
+{
+ return (SourceLabelOverlap) Scan<size_t>(input);
+}
+
+template<>
+inline XmlInputType Scan<XmlInputType>(const std::string &input)
+{
+ XmlInputType ret;
+ if (input=="exclusive") ret = XmlExclusive;
+ else if (input=="inclusive") ret = XmlInclusive;
+ else if (input=="constraint") ret = XmlConstraint;
+ else if (input=="ignore") ret = XmlIgnore;
+ else if (input=="pass-through") ret = XmlPassThrough;
+ else {
+ UTIL_THROW2("Unknown XML input type");
+ }
+
+ return ret;
+}
+
//! Specialisation to understand yes/no y/n true/false 0/1
template<>
bool Scan<bool>(const std::string &input);
@@ -430,7 +483,19 @@ T log_sum (T log_a, T log_b)
return ( v );
}
-
+/** Enforce rounding */
+inline void FixPrecision(std::ostream& stream, size_t size = 3)
+{
+ stream.setf(std::ios::fixed);
+ stream.precision(size);
}
+class FeatureFunction;
+
+void PrintFeatureWeight(const FeatureFunction* ff);
+void ShowWeights();
+
+
+} // namespace
+
#endif
diff --git a/moses/Word.cpp b/moses/Word.cpp
index 384f183d0..b1ea77059 100644
--- a/moses/Word.cpp
+++ b/moses/Word.cpp
@@ -34,15 +34,15 @@ using namespace std;
namespace Moses
{
-
-// utility function for factorless decoding
-size_t
-max_fax()
-{
- if (StaticData::Instance().GetFactorDelimiter().size())
- return MAX_NUM_FACTORS;
- return 1;
-}
+
+ // utility function for factorless decoding
+ size_t
+ max_fax()
+ {
+ if (StaticData::Instance().GetFactorDelimiter().size())
+ return MAX_NUM_FACTORS;
+ return 1;
+ }
// static
int Word::Compare(const Word &targetWord, const Word &sourceWord)
@@ -85,8 +85,8 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan
unsigned int stop = min(max_fax(),factorType.size());
for (unsigned int i = 0 ; i < stop ; i++) {
UTIL_THROW_IF2(factorType[i] >= MAX_NUM_FACTORS,
- "Trying to reference factor " << factorType[i]
- << ". Max factor is " << MAX_NUM_FACTORS);
+ "Trying to reference factor " << factorType[i]
+ << ". Max factor is " << MAX_NUM_FACTORS);
const Factor *factor = m_factorArray[factorType[i]];
if (factor != NULL) {
@@ -109,46 +109,49 @@ StringPiece Word::GetString(FactorType factorType) const
class StrayFactorException : public util::Exception {};
-void
+void
Word::
CreateFromString(FactorDirection direction
- , const std::vector<FactorType> &factorOrder
- , const StringPiece &str
- , bool isNonTerminal
- , bool strict)
+ , const std::vector<FactorType> &factorOrder
+ , const StringPiece &str
+ , bool isNonTerminal
+ , bool strict)
{
FactorCollection &factorCollection = FactorCollection::Instance();
vector<StringPiece> bits(MAX_NUM_FACTORS);
string factorDelimiter = StaticData::Instance().GetFactorDelimiter();
- if (factorDelimiter.size()) {
- util::TokenIter<util::MultiCharacter> fit(str, factorDelimiter);
- size_t i = 0;
- for (; i < MAX_NUM_FACTORS && fit; ++i,++fit)
- bits[i] = *fit;
- if (i == MAX_NUM_FACTORS)
- UTIL_THROW_IF(fit, StrayFactorException,
- "The hard limit for factors is " << MAX_NUM_FACTORS
- << ". The word " << str << " contains factor delimiter "
- << StaticData::Instance().GetFactorDelimiter()
- << " too many times.");
- if (strict)
- UTIL_THROW_IF(fit, StrayFactorException,
- "You have configured " << factorOrder.size()
- << " factors but the word " << str
- << " contains factor delimiter "
- << StaticData::Instance().GetFactorDelimiter()
- << " too many times.");
-
- UTIL_THROW_IF(i < factorOrder.size(),util::Exception,
- "Too few factors in string '" << str << "'.");
- } else {
- bits[0] = str;
- }
- for (size_t k = 0; k < factorOrder.size(); ++k) {
- UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
- "Factor order out of bounds.");
- m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
- }
+ if (factorDelimiter.size())
+ {
+ util::TokenIter<util::MultiCharacter> fit(str, factorDelimiter);
+ size_t i = 0;
+ for (; i < MAX_NUM_FACTORS && fit; ++i,++fit)
+ bits[i] = *fit;
+ if (i == MAX_NUM_FACTORS)
+ UTIL_THROW_IF(fit, StrayFactorException,
+ "The hard limit for factors is " << MAX_NUM_FACTORS
+ << ". The word " << str << " contains factor delimiter "
+ << StaticData::Instance().GetFactorDelimiter()
+ << " too many times.");
+ if (strict)
+ UTIL_THROW_IF(fit, StrayFactorException,
+ "You have configured " << factorOrder.size()
+ << " factors but the word " << str
+ << " contains factor delimiter "
+ << StaticData::Instance().GetFactorDelimiter()
+ << " too many times.");
+ UTIL_THROW_IF(!isNonTerminal && i < factorOrder.size(),util::Exception,
+ "Too few factors in string '" << str << "'.");
+ }
+ else
+ {
+ bits[0] = str;
+ }
+ for (size_t k = 0; k < factorOrder.size(); ++k)
+ {
+ UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
+ "Factor order out of bounds.");
+ m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
+ }
// assume term/non-term same for all factors
m_isNonTerminal = isNonTerminal;
}
@@ -185,10 +188,10 @@ void Word::OnlyTheseFactors(const FactorMask &factors)
bool Word::IsEpsilon() const
{
- const Factor *factor = m_factorArray[0];
- int compare = factor->GetString().compare(EPSILON);
+ const Factor *factor = m_factorArray[0];
+ int compare = factor->GetString().compare(EPSILON);
- return compare == 0;
+ return compare == 0;
}
TO_STRING_BODY(Word);
diff --git a/moses/WordLattice.cpp b/moses/WordLattice.cpp
index 269ad46ca..01b89bfb8 100644
--- a/moses/WordLattice.cpp
+++ b/moses/WordLattice.cpp
@@ -56,7 +56,7 @@ InitializeFromPCNDataType
const std::vector<FactorType>& factorOrder,
const std::string& debug_line)
{
- const StaticData &staticData = StaticData::Instance();
+ // const StaticData &staticData = StaticData::Instance();
const InputFeature &inputFeature = InputFeature::Instance();
size_t numInputScores = inputFeature.GetNumInputScores();
size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
diff --git a/moses/XmlOption.cpp b/moses/XmlOption.cpp
index 21ce0a411..2f66d647e 100644
--- a/moses/XmlOption.cpp
+++ b/moses/XmlOption.cpp
@@ -24,12 +24,18 @@
#include <vector>
#include <string>
#include <iostream>
+#include <boost/foreach.hpp>
+#include <boost/unordered_map.hpp>
#include "Util.h"
#include "StaticData.h"
#include "WordsRange.h"
#include "TargetPhrase.h"
#include "ReorderingConstraint.h"
#include "FactorCollection.h"
+#include "moses/TranslationModel/PhraseDictionary.h"
+#if PT_UG
+#include "TranslationModel/UG/mmsapt.h"
+#endif
namespace Moses
{
@@ -160,6 +166,12 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
const StaticData &staticData = StaticData::Instance();
+ // hack. What pt should XML trans opt be assigned to?
+ PhraseDictionary *firstPt = NULL;
+  if (PhraseDictionary::GetColl().size() != 0) {
+ firstPt = PhraseDictionary::GetColl()[0];
+ }
+
// no xml tag? we're done.
//if (line.find_first_of('<') == string::npos) {
if (line.find(lbrackStr) == string::npos) {
@@ -306,12 +318,94 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
}
+ // update: add new aligned sentence pair to Mmsapt identified by name
+ else if (tagName == "update") {
+#if PT_UG
+ // get model name and aligned sentence pair
+ string pdName = ParseXmlTagAttribute(tagContent,"name");
+ string source = ParseXmlTagAttribute(tagContent,"source");
+ string target = ParseXmlTagAttribute(tagContent,"target");
+ string alignment = ParseXmlTagAttribute(tagContent,"alignment");
+ // find PhraseDictionary by name
+ const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl();
+ PhraseDictionary* pd = NULL;
+ for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) {
+ PhraseDictionary* curPd = *i;
+ if (curPd->GetScoreProducerDescription() == pdName) {
+ pd = curPd;
+ break;
+ }
+ }
+ if (pd == NULL) {
+ TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl);
+ return false;
+ }
+ // update model
+ VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl);
+      Mmsapt* pdsa = static_cast<Mmsapt*>(pd);
+ pdsa->add(source, target, alignment);
+#else
+ TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl);
+ return false;
+#endif
+ }
+
+ // weight-overwrite: update feature weights, unspecified weights remain unchanged
+ // IMPORTANT: translation models that cache phrases or apply table-limit during load
+ // based on initial weights need to be reset. Sending an empty update will do this
+ // for PhraseDictionaryBitextSampling (Mmsapt) models:
+ // <update name="TranslationModelName" source=" " target=" " alignment=" " />
+ else if (tagName == "weight-overwrite") {
+
+ // is a name->ff map stored anywhere so we don't have to build it every time?
+ const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
+ boost::unordered_map<string, FeatureFunction*> map;
+ BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
+ map[ff->GetScoreProducerDescription()] = ff;
+ }
+
+ // update each weight listed
+ ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights();
+    boost::unordered_map<string, FeatureFunction*>::iterator ffi = map.end();
+ string ffName("");
+ vector<float> ffWeights;
+ vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights"));
+ BOOST_FOREACH(string const& tok, toks) {
+ if (tok.substr(tok.size() - 1, 1) == "=") {
+ // start new feature
+ if (ffName != "") {
+ // set previous feature weights
+ if (ffi != map.end()) {
+ allWeights.Assign(ffi->second, ffWeights);
+ }
+ ffWeights.clear();
+ }
+ ffName = tok.substr(0, tok.size() - 1);
+ ffi = map.find(ffName);
+ if (ffi == map.end()) {
+ TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl);
+ }
+ } else {
+ // weight for current feature
+ ffWeights.push_back(Scan<float>(tok));
+ }
+ }
+ if (ffi != map.end()) {
+ allWeights.Assign(ffi->second, ffWeights);
+ }
+ StaticData::InstanceNonConst().SetAllWeights(allWeights);
+ }
+
// default: opening tag that specifies translation options
else {
- if (startPos >= endPos) {
- TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
+ if (startPos > endPos) {
+ TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl);
return false;
}
+ else if (startPos == endPos) {
+ TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl);
+ continue;
+ }
// specified translations -> vector of phrases
// multiple translations may be specified, separated by "||"
@@ -357,7 +451,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
float scoreValue = FloorScore(TransformScore(probValue));
WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase
- TargetPhrase targetPhrase;
+ TargetPhrase targetPhrase(firstPt);
// targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);
@@ -371,7 +465,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
}
targetPhrase.SetXMLScore(scoreValue);
- targetPhrase.Evaluate(sourcePhrase);
+ targetPhrase.EvaluateInIsolation(sourcePhrase);
XmlOption *option = new XmlOption(range,targetPhrase);
assert(option);
diff --git a/moses-cmd/mbr.cpp b/moses/mbr.cpp
index 6a8dfa823..6a8dfa823 100644
--- a/moses-cmd/mbr.cpp
+++ b/moses/mbr.cpp
diff --git a/moses-cmd/mbr.h b/moses/mbr.h
index d08b11a98..d08b11a98 100644
--- a/moses-cmd/mbr.h
+++ b/moses/mbr.h
diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp
index 0526d058b..4cbaba50c 100644
--- a/phrase-extract/DomainFeature.cpp
+++ b/phrase-extract/DomainFeature.cpp
@@ -2,9 +2,6 @@
#include "ExtractionPhrasePair.h"
#include "tables-core.h"
#include "InputFileStream.h"
-#include "SafeGetline.h"
-
-#define TABLE_LINE_MAX_LENGTH 1000
using namespace std;
@@ -16,12 +13,11 @@ void Domain::load( const std::string &domainFileName )
{
Moses::InputFileStream fileS( domainFileName );
istream *fileP = &fileS;
- while(true) {
- char line[TABLE_LINE_MAX_LENGTH];
- SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
- if (fileP->eof()) break;
+
+ string line;
+ while(getline(*fileP, line)) {
// read
- vector< string > domainSpecLine = tokenize( line );
+ vector< string > domainSpecLine = tokenize( line.c_str() );
int lineNumber;
if (domainSpecLine.size() != 2 ||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
index a4f0e62fb..2fb93fab2 100644
--- a/phrase-extract/ExtractionPhrasePair.cpp
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -19,7 +19,6 @@
#include <sstream>
#include "ExtractionPhrasePair.h"
-#include "SafeGetline.h"
#include "tables-core.h"
#include "score.h"
#include "moses/Util.h"
@@ -324,5 +323,238 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
}
+std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
+ std::set<std::string>& labelSet,
+ boost::unordered_map<std::string,float>& countsLabelsLHS,
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& jointCountsRulesTargetLHSAndLabelsLHS,
+ Vocabulary &vcbT) const
+{
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
+
+ if ( allPropertyValues == NULL ) {
+ return "";
+ }
+
+ std::string lhs="", rhs="", currentRhs="";
+ float currentRhsCount = 0.0;
+ std::list< std::pair<std::string,float> > lhsGivenCurrentRhsCounts;
+
+ std::ostringstream oss;
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
+ iter!=allPropertyValues->end(); ++iter) {
+
+ size_t space = (iter->first).find_last_of(' ');
+ if ( space == string::npos ) {
+ lhs = iter->first;
+ rhs.clear();
+ } else {
+ lhs = (iter->first).substr(space+1);
+ rhs = (iter->first).substr(0,space);
+ }
+
+ labelSet.insert(lhs);
+
+ if ( rhs.compare(currentRhs) ) {
+
+ if ( iter!=allPropertyValues->begin() ) {
+ if ( !currentRhs.empty() ) {
+ istringstream tokenizer(currentRhs);
+ std::string rhsLabel;
+ while ( tokenizer.peek() != EOF ) {
+ tokenizer >> rhsLabel;
+ labelSet.insert(rhsLabel);
+ }
+ oss << " " << currentRhs << " " << currentRhsCount;
+ }
+ if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
+ if ( !currentRhs.empty() ) {
+ oss << " " << lhsGivenCurrentRhsCounts.size();
+ }
+ for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
+ iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
+ oss << " " << iter2->first << " " << iter2->second;
+
+ // update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
+ std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
+ ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
+ ruleTargetLhs.erase(ruleTargetLhs.size()-1);
+
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
+ countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ if (!insertedCountsLabelsLHS.second) {
+ (insertedCountsLabelsLHS.first)->second += iter2->second;
+ }
+
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
+ jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
+ if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
+ boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
+ } else {
+ boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ if (!insertedJointCounts.second) {
+ (insertedJointCounts.first)->second += iter2->second;
+ }
+ }
+
+ }
+ }
+
+ lhsGivenCurrentRhsCounts.clear();
+ }
+
+ currentRhsCount = 0.0;
+ currentRhs = rhs;
+ }
+
+ currentRhsCount += iter->second;
+ lhsGivenCurrentRhsCounts.push_back( std::pair<std::string,float>(lhs,iter->second) );
+ }
+
+ if ( !currentRhs.empty() ) {
+ istringstream tokenizer(currentRhs);
+ std::string rhsLabel;
+ while ( tokenizer.peek() != EOF ) {
+ tokenizer >> rhsLabel;
+ labelSet.insert(rhsLabel);
+ }
+ oss << " " << currentRhs << " " << currentRhsCount;
+ }
+ if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
+ if ( !currentRhs.empty() ) {
+ oss << " " << lhsGivenCurrentRhsCounts.size();
+ }
+ for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
+ iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
+ oss << " " << iter2->first << " " << iter2->second;
+
+ // update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
+ std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
+ ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
+ ruleTargetLhs.erase(ruleTargetLhs.size()-1);
+
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
+ countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ if (!insertedCountsLabelsLHS.second) {
+ (insertedCountsLabelsLHS.first)->second += iter2->second;
+ }
+
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
+ jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
+ if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
+ boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
+ } else {
+ boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ if (!insertedJointCounts.second) {
+ (insertedJointCounts.first)->second += iter2->second;
+ }
+ }
+
+ }
+ }
+
+ std::string allPropertyValuesString(oss.str());
+ return allPropertyValuesString;
+}
+
+
+void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key,
+ const std::vector<float> &orientationClassPriorsL2R,
+ const std::vector<float> &orientationClassPriorsR2L,
+ double smoothingFactor,
+ std::ostream &out) const
+{
+ assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
+
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
+
+ if ( allPropertyValues == NULL ) {
+ return;
+ }
+
+ // bidirectional MSLR phrase orientation with 2x4 orientation classes:
+ // mono swap dright dleft
+ std::vector<float> orientationClassCountSumL2R(4,0);
+ std::vector<float> orientationClassCountSumR2L(4,0);
+
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
+ iter!=allPropertyValues->end(); ++iter) {
+ std::string l2rOrientationClass, r2lOrientationClass;
+ try {
+ istringstream tokenizer(iter->first);
+ tokenizer >> l2rOrientationClass;
+ tokenizer >> r2lOrientationClass;
+ if ( tokenizer.peek() != EOF ) {
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+ << ": Collecting phrase orientations failed. "
+ << "Too many tokens?");
+ }
+ } catch (const std::exception &e) {
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+ << ": Collecting phrase orientations failed. "
+ << "Flawed property value in extract file?");
+ }
+
+ int l2rOrientationClassId = -1;
+ if (!l2rOrientationClass.compare("mono")) {
+ l2rOrientationClassId = 0;
+ }
+ if (!l2rOrientationClass.compare("swap")) {
+ l2rOrientationClassId = 1;
+ }
+ if (!l2rOrientationClass.compare("dleft")) {
+ l2rOrientationClassId = 2;
+ }
+ if (!l2rOrientationClass.compare("dright")) {
+ l2rOrientationClassId = 3;
+ }
+ if (l2rOrientationClassId == -1) {
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+ << ": Collecting phrase orientations failed. "
+ << "Unknown orientation class \"" << l2rOrientationClass << "\"." );
+ }
+ int r2lOrientationClassId = -1;
+ if (!r2lOrientationClass.compare("mono")) {
+ r2lOrientationClassId = 0;
+ }
+ if (!r2lOrientationClass.compare("swap")) {
+ r2lOrientationClassId = 1;
+ }
+ if (!r2lOrientationClass.compare("dleft")) {
+ r2lOrientationClassId = 2;
+ }
+ if (!r2lOrientationClass.compare("dright")) {
+ r2lOrientationClassId = 3;
+ }
+ if (r2lOrientationClassId == -1) {
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+ << ": Collecting phrase orientations failed. "
+ << "Unknown orientation class \"" << r2lOrientationClass << "\"." );
+ }
+
+ orientationClassCountSumL2R[l2rOrientationClassId] += iter->second;
+ orientationClassCountSumR2L[r2lOrientationClassId] += iter->second;
+ }
+
+ for (size_t i=0; i<4; ++i) {
+ if (i>0) {
+ out << " ";
+ }
+ out << (float)( (smoothingFactor*orientationClassPriorsL2R[i] + orientationClassCountSumL2R[i]) / (smoothingFactor + m_count) );
+ }
+ for (size_t i=0; i<4; ++i) {
+ out << " " << (float)( (smoothingFactor*orientationClassPriorsR2L[i] + orientationClassCountSumR2L[i]) / (smoothingFactor + m_count) );
+ }
+}
+
+
+
}
diff --git a/phrase-extract/ExtractionPhrasePair.h b/phrase-extract/ExtractionPhrasePair.h
index e9f643d2c..368033284 100644
--- a/phrase-extract/ExtractionPhrasePair.h
+++ b/phrase-extract/ExtractionPhrasePair.h
@@ -23,6 +23,7 @@
#include <vector>
#include <set>
#include <map>
+#include <boost/unordered_map.hpp>
namespace MosesTraining
{
@@ -126,6 +127,18 @@ public:
std::string CollectAllPropertyValues(const std::string &key) const;
+ std::string CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
+ std::set<std::string>& sourceLabelSet,
+ boost::unordered_map<std::string,float>& sourceLHSCounts,
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& sourceRHSAndLHSJointCounts,
+ Vocabulary &vcbT) const;
+
+ void CollectAllPhraseOrientations(const std::string &key,
+ const std::vector<float> &orientationClassPriorsL2R,
+ const std::vector<float> &orientationClassPriorsR2L,
+ double smoothingFactor,
+ std::ostream &out) const;
+
void AddProperties( const std::string &str, float count );
void AddProperty( const std::string &key, const std::string &value, float count ) {
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index 50fed2973..0e35c437c 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -4,7 +4,7 @@ for local d in $(most-deps) {
obj $(d:B).o : $(d) ;
}
#and stuff them into an alias.
-alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ../moses//ThreadPool ../moses//Util ../util//kenutil ;
+alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ../moses//ThreadPool ../moses//Util ../util//kenutil ;
#ExtractionPhrasePair.cpp requires that main define some global variables.
#Build the mains that do not need these global variables.
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
index ac22f333a..14d1575e1 100644
--- a/phrase-extract/PhraseExtractionOptions.h
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -18,7 +18,6 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
#include <string>
#include <vector>
diff --git a/phrase-extract/PropertiesConsolidator.cpp b/phrase-extract/PropertiesConsolidator.cpp
new file mode 100644
index 000000000..642c48672
--- /dev/null
+++ b/phrase-extract/PropertiesConsolidator.cpp
@@ -0,0 +1,159 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "PropertiesConsolidator.h"
+
+#include <sstream>
+#include <limits>
+#include <vector>
+
+#include "moses/Util.h"
+#include "phrase-extract/InputFileStream.h"
+#include "phrase-extract/OutputFileStream.h"
+
+
+namespace MosesTraining
+{
+
+void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
+{
+ Moses::InputFileStream inFile(sourceLabelSetFile);
+
+ // read source label set
+ m_sourceLabels.clear();
+ std::string line;
+ while (getline(inFile, line)) {
+ std::istringstream tokenizer(line);
+ std::string label;
+ size_t index;
+ try {
+ tokenizer >> label >> index;
+ } catch (const std::exception &e) {
+ UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
+ }
+ std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
+ UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
+ }
+
+ inFile.Close();
+
+ m_sourceLabelsFlag = true;
+}
+
+
+std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
+{
+ if ( propertiesString.empty() ) {
+ return propertiesString;
+ }
+
+ std::ostringstream out;
+ std::vector<std::string> toks;
+ Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
+ for (size_t i = 1; i < toks.size(); ++i) {
+ std::string &tok = toks[i];
+ if (tok.empty()) {
+ continue;
+ }
+ size_t endPos = tok.rfind("}");
+ tok = tok.substr(0, endPos - 1);
+ std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
+ assert(keyValue.size() == 2);
+
+ if ( !keyValue[0].compare("SourceLabels") ) {
+
+ if ( m_sourceLabelsFlag ) {
+
+ // SourceLabels additional property: replace strings with vocabulary indices
+ out << " {{" << keyValue[0];
+
+ std::istringstream tokenizer(keyValue[1]);
+
+ size_t nNTs;
+ double totalCount;
+
+ if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
+ UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
+ << "Flawed SourceLabels property?");
+ }
+ assert( nNTs > 0 );
+ out << " " << nNTs;
+
+ if (! (tokenizer >> totalCount)) { // second token: overall rule count
+ UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
+ << "Flawed SourceLabels property?");
+ }
+ assert( totalCount > 0.0 );
+ out << " " << totalCount;
+
+ while (tokenizer.peek() != EOF) {
+ try {
+
+ size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
+
+ std::string token;
+
+ if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
+ for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
+ tokenizer >> token; // RHS source non-terminal label
+ std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
+ UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
+ out << " " << found->second;
+ }
+
+ tokenizer >> token; // sourceLabelsRHSCount
+ out << " " << token;
+
+ tokenizer >> numberOfLHSsGivenRHS;
+ out << " " << numberOfLHSsGivenRHS;
+ }
+
+ for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
+ tokenizer >> token; // LHS source non-terminal label
+ std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
+ UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
+ out << " " << found->second;
+
+ tokenizer >> token; // ruleSourceLabelledCount
+ out << " " << token;
+ }
+
+ } catch (const std::exception &e) {
+ UTIL_THROW2("Flawed item in SourceLabels property?");
+ }
+ }
+
+ out << "}}";
+
+ } else { // don't process source labels additional property
+ out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
+ }
+
+ } else {
+
+ // output other additional property
+ out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
+ }
+ }
+
+ return out.str();
+}
+
+} // namespace MosesTraining
+
diff --git a/contrib/other-builds/extract-mixed-syntax/XmlTree.h b/phrase-extract/PropertiesConsolidator.h
index cd54b8f17..cc6a7a835 100644
--- a/contrib/other-builds/extract-mixed-syntax/XmlTree.h
+++ b/phrase-extract/PropertiesConsolidator.h
@@ -1,35 +1,48 @@
-#pragma once
-
-// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
-// vim:tabstop=2
-
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2006 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#include <string>
-#include <vector>
-#include <set>
-#include <map>
-#include "SyntaxTree.h"
-
-std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
-std::string TrimXml(const std::string& str);
-bool isXmlTag(const std::string& tag);
-inline std::vector<std::string> TokenizeXml(const std::string& str);
-bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection );
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+
+#pragma once
+
+#include <string>
+#include <map>
+
+
+namespace MosesTraining
+{
+
+class PropertiesConsolidator
+{
+public:
+
+ PropertiesConsolidator() : m_sourceLabelsFlag(false) {};
+
+ void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
+
+ std::string ProcessPropertiesString(const std::string &propertiesString) const;
+
+private:
+
+ bool m_sourceLabelsFlag;
+ std::map<std::string,size_t> m_sourceLabels;
+
+};
+
+} // namespace MosesTraining
+
diff --git a/phrase-extract/SafeGetline.h b/phrase-extract/SafeGetline.h
deleted file mode 100644
index 0e03b8468..000000000
--- a/phrase-extract/SafeGetline.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/***********************************************************************
- Moses - factored phrase-based language decoder
- Copyright (C) 2010 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#pragma once
-#ifndef SAFE_GETLINE_INCLUDED_
-#define SAFE_GETLINE_INCLUDED_
-
-#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM, _FILE) { \
- _IS.getline(_LINE, _SIZE, _DELIM); \
- if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
- if (_IS.gcount() == _SIZE-1) { \
- cerr << "Line too long! Buffer overflow. Delete lines >=" \
- << _SIZE << " chars or raise LINE_MAX_LENGTH in " << _FILE \
- << endl; \
- exit(1); \
- } \
- }
-
-#endif
diff --git a/phrase-extract/ScoreFeature.h b/phrase-extract/ScoreFeature.h
index 1f697c989..f94890ba1 100644
--- a/phrase-extract/ScoreFeature.h
+++ b/phrase-extract/ScoreFeature.h
@@ -90,7 +90,7 @@ public:
float count,
int sentenceId) const {};
- /** Add the values for this feature function. */
+ /** Add the values for this score feature. */
virtual void add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const = 0;
diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp
index 534ab177b..7af06405c 100644
--- a/phrase-extract/ScoreFeatureTest.cpp
+++ b/phrase-extract/ScoreFeatureTest.cpp
@@ -52,14 +52,18 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except)
{
//Check that configure rejects illegal domain arg combinations
ScoreFeatureManager manager;
- vector<string> args = boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null");
- BOOST_CHECK_THROW(manager.configure(args), ScoreFeatureArgumentException);
- args = boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null");
- BOOST_CHECK_THROW(manager.configure(args), ScoreFeatureArgumentException);
- args = boost::assign::list_of("--SparseDomainBlah")("/dev/null");
- BOOST_CHECK_THROW(manager.configure(args), ScoreFeatureArgumentException);
- args = boost::assign::list_of("--DomainSubset");
- BOOST_CHECK_THROW(manager.configure(args), ScoreFeatureArgumentException);
+ BOOST_CHECK_THROW(
+ manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")),
+ ScoreFeatureArgumentException);
+ BOOST_CHECK_THROW(
+ manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")),
+ ScoreFeatureArgumentException);
+ BOOST_CHECK_THROW(
+ manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")),
+ ScoreFeatureArgumentException);
+ BOOST_CHECK_THROW(
+ manager.configure(boost::assign::list_of("--DomainSubset")),
+ ScoreFeatureArgumentException);
}
template <class Expected>
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index c3d71d525..120c9154d 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -54,7 +54,11 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bo
return true;
}
-bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules)
+bool SentenceAlignment::create(const char targetString[],
+ const char sourceString[],
+ const char alignmentString[],
+ const char weightString[],
+ int sentenceID, bool boundaryRules)
{
using namespace std;
this->sentenceID = sentenceID;
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
index 1df61cf02..576d3279e 100644
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@@ -43,8 +43,11 @@ public:
virtual bool processSourceSentence(const char *, int, bool boundaryRules);
- bool create(char targetString[], char sourceString[],
- char alignmentString[], char weightString[], int sentenceID, bool boundaryRules);
+ bool create(const char targetString[],
+ const char sourceString[],
+ const char alignmentString[],
+ const char weightString[],
+ int sentenceID, bool boundaryRules);
void invertAlignment();
diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxTree.h
index 5ea7ccbed..8d65b99bd 100644
--- a/phrase-extract/SyntaxTree.h
+++ b/phrase-extract/SyntaxTree.h
@@ -96,9 +96,10 @@ protected:
friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);
public:
- SyntaxTree() {
- m_top = 0; // m_top doesn't get set unless ConnectNodes is called.
- }
+ SyntaxTree()
+ : m_top(0) // m_top doesn't get set unless ConnectNodes is called.
+ , m_size(0) {}
+
~SyntaxTree();
SyntaxNode *AddNode( int startPos, int endPos, std::string label );
diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp
index dcb974bef..ce7e6837e 100644
--- a/phrase-extract/XmlTree.cpp
+++ b/phrase-extract/XmlTree.cpp
@@ -248,7 +248,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
string cleanLine; // return string (text without xml)
size_t wordPos = 0; // position in sentence (in terms of number of words)
- bool isLinked = false;
// loop through the tokens
for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
@@ -354,10 +353,14 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
// cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
- if (startPos >= endPos) {
- cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
+ if (startPos > endPos) {
+ cerr << "ERROR: tag " << tagName << " startPos is bigger than endPos (" << startPos << "-" << endPos << "): " << line << endl;
return false;
}
+ else if (startPos == endPos) {
+ cerr << "WARNING: tag " << tagName << ". Ignoring 0 span (" << startPos << "-" << endPos << "): " << line << endl;
+ continue;
+ }
string label = ParseXmlTagAttribute(tagContent,"label");
labelCollection.insert( label );
diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp
index 3b38f741c..40e0e35d4 100644
--- a/phrase-extract/consolidate-direct-main.cpp
+++ b/phrase-extract/consolidate-direct-main.cpp
@@ -26,16 +26,9 @@
#include "InputFileStream.h"
#include "OutputFileStream.h"
-#include "SafeGetline.h"
-
-#define LINE_MAX_LENGTH 10000
-
using namespace std;
-char line[LINE_MAX_LENGTH];
-
-
-vector< string > splitLine()
+vector< string > splitLine(const char *line)
{
vector< string > item;
int start=0;
@@ -61,14 +54,15 @@ bool getLine( istream &fileP, vector< string > &item )
{
if (fileP.eof())
return false;
-
- SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (fileP.eof())
+
+ string line;
+ if (getline(fileP, line)) {
+ item = splitLine(line.c_str());
return false;
-
- item = splitLine();
-
- return true;
+ }
+ else {
+ return false;
+ }
}
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index de0d7f646..10697a956 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -26,11 +26,9 @@
#include <cstring>
#include "tables-core.h"
-#include "SafeGetline.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
-
-#define LINE_MAX_LENGTH 10000
+#include "PropertiesConsolidator.h"
using namespace std;
@@ -40,18 +38,18 @@ bool phraseCountFlag = false;
bool lowCountFlag = false;
bool goodTuringFlag = false;
bool kneserNeyFlag = false;
+bool sourceLabelsFlag = false;
bool logProbFlag = false;
inline float maybeLogProb( float a )
{
return logProbFlag ? log(a) : a;
}
-char line[LINE_MAX_LENGTH];
-void processFiles( char*, char*, char*, char* );
+void processFiles( char*, char*, char*, char*, char* );
void loadCountOfCounts( char* );
void breakdownCoreAndSparse( string combined, string &core, string &sparse );
bool getLine( istream &fileP, vector< string > &item );
-vector< string > splitLine();
+vector< string > splitLine(const char *line);
vector< int > countBin;
bool sparseCountBinFeatureFlag = false;
@@ -61,13 +59,14 @@ int main(int argc, char* argv[])
<< "consolidating direct and indirect rule tables\n";
if (argc < 4) {
- cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] \n";
+ cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] \n";
exit(1);
}
char* &fileNameDirect = argv[1];
char* &fileNameIndirect = argv[2];
char* &fileNameConsolidated = argv[3];
char* fileNameCountOfCounts;
+ char* fileNameSourceLabelSet;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
@@ -118,13 +117,21 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
+ } else if (strcmp(argv[i],"--SourceLabels") == 0) {
+ sourceLabelsFlag = true;
+ if (i+1==argc) {
+ cerr << "ERROR: specify source label set file!\n";
+ exit(1);
+ }
+ fileNameSourceLabelSet = argv[++i];
+ cerr << "processing source labels property\n";
} else {
cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1);
}
}
- processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
+ processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
}
vector< float > countOfCounts;
@@ -140,14 +147,13 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
istream &fileP = fileCountOfCounts;
countOfCounts.push_back(0.0);
- while(1) {
- if (fileP.eof()) break;
- SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (fileP.eof()) break;
+
+ string line;
+ while (getline(fileP, line)) {
if (totalCount < 0)
- totalCount = atof(line); // total number of distinct phrase pairs
+ totalCount = atof(line.c_str()); // total number of distinct phrase pairs
else
- countOfCounts.push_back( atof(line) );
+ countOfCounts.push_back( atof(line.c_str()) );
}
fileCountOfCounts.Close();
@@ -174,7 +180,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
}
-void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
+void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
{
if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts );
@@ -203,6 +209,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
exit(1);
}
+ // create properties consolidator
+ // (in case any additional phrase property requires further processing)
+ MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
+ if (sourceLabelsFlag) {
+ propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
+ }
+
// loop through all extracted phrase translations
int i=0;
while(true) {
@@ -312,12 +325,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// counts, for debugging
fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
- // count bin feature (as a sparse feature)
+ // sparse features
fileConsolidated << " |||";
if (directSparseScores.compare("") != 0)
fileConsolidated << " " << directSparseScores;
if (indirectSparseScores.compare("") != 0)
fileConsolidated << " " << indirectSparseScores;
+ // count bin feature (as a sparse feature)
if (sparseCountBinFeatureFlag) {
bool foundBin = false;
for(size_t i=0; i < countBin.size(); i++) {
@@ -337,8 +351,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
}
// arbitrary key-value pairs
+ fileConsolidated << " |||";
if (itemDirect.size() >= 6) {
- fileConsolidated << " ||| " << itemDirect[5];
+ //if (sourceLabelsFlag) {
+ fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
+ //} else {
+ // fileConsolidated << itemDirect[5];
+ //}
}
fileConsolidated << endl;
@@ -370,16 +389,16 @@ bool getLine( istream &fileP, vector< string > &item )
if (fileP.eof())
return false;
- SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (fileP.eof())
+ string line;
+ if (!getline(fileP, line))
return false;
- item = splitLine();
+ item = splitLine(line.c_str());
return true;
}
-vector< string > splitLine()
+vector< string > splitLine(const char *line)
{
vector< string > item;
int start=0;
diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp
index 6843bf3aa..ce59315b9 100644
--- a/phrase-extract/consolidate-reverse-main.cpp
+++ b/phrase-extract/consolidate-reverse-main.cpp
@@ -27,23 +27,19 @@
#include <cstring>
#include "tables-core.h"
-#include "SafeGetline.h"
#include "InputFileStream.h"
-#define LINE_MAX_LENGTH 10000
-
using namespace std;
bool hierarchicalFlag = false;
bool onlyDirectFlag = false;
bool phraseCountFlag = true;
bool logProbFlag = false;
-char line[LINE_MAX_LENGTH];
void processFiles( char*, char*, char* );
bool getLine( istream &fileP, vector< string > &item );
string reverseAlignment(const string &alignments);
-vector< string > splitLine();
+vector< string > splitLine(const char *lin);
inline void Tokenize(std::vector<std::string> &output
, const std::string& str
@@ -190,17 +186,18 @@ bool getLine( istream &fileP, vector< string > &item )
{
if (fileP.eof())
return false;
-
- SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (fileP.eof())
+
+ string line;
+ if (getline(fileP, line)) {
+ item = splitLine(line.c_str());
return false;
-
- item = splitLine();
-
- return true;
+ }
+ else {
+ return false;
+ }
}
-vector< string > splitLine()
+vector< string > splitLine(const char *line)
{
vector< string > item;
bool betweenWords = true;
diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp
index 44cf2006c..7e084e495 100644
--- a/phrase-extract/extract-ghkm/Alignment.cpp
+++ b/phrase-extract/extract-ghkm/Alignment.cpp
@@ -21,6 +21,7 @@
#include "Exception.h"
+#include <algorithm>
#include <cassert>
#include <cstdlib>
@@ -61,5 +62,12 @@ void ReadAlignment(const std::string &s, Alignment &a)
}
}
+void FlipAlignment(Alignment &a)
+{
+ for (Alignment::iterator p = a.begin(); p != a.end(); ++p) {
+ std::swap(p->first, p->second);
+ }
+}
+
} // namespace GHKM
} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h
index 8dbaf483f..e8381a602 100644
--- a/phrase-extract/extract-ghkm/Alignment.h
+++ b/phrase-extract/extract-ghkm/Alignment.h
@@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
-#ifndef EXTRACT_GHKM_ALIGNMENT_H_
-#define EXTRACT_GHKM_ALIGNMENT_H_
#include <string>
#include <utility>
@@ -34,7 +32,8 @@ typedef std::vector<std::pair<int, int> > Alignment;
void ReadAlignment(const std::string &, Alignment &);
+void FlipAlignment(Alignment &);
+
} // namespace GHKM
} // namespace Moses
-#endif
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index c94ba3fda..7c210541d 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -27,9 +27,16 @@
#include "OutputFileStream.h"
#include "Options.h"
#include "ParseTree.h"
+#include "PhraseOrientation.h"
#include "ScfgRule.h"
#include "ScfgRuleWriter.h"
#include "Span.h"
+#include "StsgRule.h"
+#include "StsgRuleWriter.h"
+#include "SyntaxTree.h"
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
#include "XmlTreeParser.h"
#include <boost/program_options.hpp>
@@ -55,16 +62,29 @@ int ExtractGHKM::Main(int argc, char *argv[])
ProcessOptions(argc, argv, options);
// Open input files.
- InputFileStream targetStream(options.targetFile);
- InputFileStream sourceStream(options.sourceFile);
+ //
+ // The GHKM algorithm is neutral about whether the model is string-to-tree or
+ // tree-to-string. This implementation assumes the model to be
+ // string-to-tree, but if the -t2s option is given then the source and target
+ // input files are switched prior to extraction and then the source and
+ // target of the extracted rules are switched on output.
+ std::string effectiveTargetFile = options.t2s ? options.sourceFile
+ : options.targetFile;
+ std::string effectiveSourceFile = options.t2s ? options.targetFile
+ : options.sourceFile;
+ InputFileStream targetStream(effectiveTargetFile);
+ InputFileStream sourceStream(effectiveSourceFile);
InputFileStream alignmentStream(options.alignmentFile);
// Open output files.
OutputFileStream fwdExtractStream;
OutputFileStream invExtractStream;
- std::ofstream glueGrammarStream;
- std::ofstream unknownWordStream;
- std::ofstream unknownWordSoftMatchesStream;
+ OutputFileStream glueGrammarStream;
+ OutputFileStream targetUnknownWordStream;
+ OutputFileStream sourceUnknownWordStream;
+ OutputFileStream sourceLabelSetStream;
+ OutputFileStream unknownWordSoftMatchesStream;
+
std::string fwdFileName = options.extractFile;
std::string invFileName = options.extractFile + std::string(".inv");
if (options.gzOutput) {
@@ -73,30 +93,50 @@ int ExtractGHKM::Main(int argc, char *argv[])
}
OpenOutputFileOrDie(fwdFileName, fwdExtractStream);
OpenOutputFileOrDie(invFileName, invExtractStream);
+
if (!options.glueGrammarFile.empty()) {
OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
}
- if (!options.unknownWordFile.empty()) {
- OpenOutputFileOrDie(options.unknownWordFile, unknownWordStream);
+ if (!options.targetUnknownWordFile.empty()) {
+ OpenOutputFileOrDie(options.targetUnknownWordFile, targetUnknownWordStream);
+ }
+ if (!options.sourceUnknownWordFile.empty()) {
+ OpenOutputFileOrDie(options.sourceUnknownWordFile, sourceUnknownWordStream);
+ }
+ if (!options.sourceLabelSetFile.empty()) {
+ if (!options.sourceLabels) {
+ Error("SourceLabels should be active if SourceLabelSet is supposed to be written to a file");
+ }
+ OpenOutputFileOrDie(options.sourceLabelSetFile, sourceLabelSetStream); // note that this is not a global source label set if extraction is parallelized
}
if (!options.unknownWordSoftMatchesFile.empty()) {
OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
}
// Target label sets for producing glue grammar.
- std::set<std::string> labelSet;
- std::map<std::string, int> topLabelSet;
+ std::set<std::string> targetLabelSet;
+ std::map<std::string, int> targetTopLabelSet;
+
+ // Source label sets for producing glue grammar.
+ std::set<std::string> sourceLabelSet;
+ std::map<std::string, int> sourceTopLabelSet;
// Word count statistics for producing unknown word labels.
- std::map<std::string, int> wordCount;
- std::map<std::string, std::string> wordLabel;
+ std::map<std::string, int> targetWordCount;
+ std::map<std::string, std::string> targetWordLabel;
+
+ // Word count statistics for producing unknown word labels: source side.
+ std::map<std::string, int> sourceWordCount;
+ std::map<std::string, std::string> sourceWordLabel;
std::string targetLine;
std::string sourceLine;
std::string alignmentLine;
Alignment alignment;
- XmlTreeParser xmlTreeParser(labelSet, topLabelSet);
- ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options);
+ XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
+// XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
+ ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
+ StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
size_t lineNum = options.sentenceOffset;
while (true) {
std::getline(targetStream, targetLine);
@@ -118,44 +158,93 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::cerr << "skipping line " << lineNum << " with empty target tree\n";
continue;
}
- std::auto_ptr<ParseTree> t;
+ std::auto_ptr<ParseTree> targetParseTree;
try {
- t = xmlTreeParser.Parse(targetLine);
- assert(t.get());
+ targetParseTree = targetXmlTreeParser.Parse(targetLine);
+ assert(targetParseTree.get());
} catch (const Exception &e) {
- std::ostringstream s;
- s << "Failed to parse XML tree at line " << lineNum;
+ std::ostringstream oss;
+ oss << "Failed to parse target XML tree at line " << lineNum;
if (!e.GetMsg().empty()) {
- s << ": " << e.GetMsg();
+ oss << ": " << e.GetMsg();
+ }
+ Error(oss.str());
+ }
+
+
+ // Parse source tree and construct a SyntaxTree object.
+ MosesTraining::SyntaxTree sourceSyntaxTree;
+ MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL;
+
+ if (options.sourceLabels) {
+ try {
+ if (!ProcessAndStripXMLTags(sourceLine, sourceSyntaxTree, sourceLabelSet, sourceTopLabelSet, false)) {
+ throw Exception("");
+ }
+ sourceSyntaxTree.ConnectNodes();
+ sourceSyntaxTreeRoot = sourceSyntaxTree.GetTop();
+ assert(sourceSyntaxTreeRoot);
+ } catch (const Exception &e) {
+ std::ostringstream oss;
+ oss << "Failed to parse source XML tree at line " << lineNum;
+ if (!e.GetMsg().empty()) {
+ oss << ": " << e.GetMsg();
+ }
+ Error(oss.str());
}
- Error(s.str());
}
// Read source tokens.
std::vector<std::string> sourceTokens(ReadTokens(sourceLine));
+ // Construct a source ParseTree object from the SyntaxTree object.
+ std::auto_ptr<ParseTree> sourceParseTree;
+
+ if (options.sourceLabels) {
+ try {
+ sourceParseTree = XmlTreeParser::ConvertTree(*sourceSyntaxTreeRoot, sourceTokens);
+ assert(sourceParseTree.get());
+ } catch (const Exception &e) {
+ std::ostringstream oss;
+ oss << "Failed to parse source XML tree at line " << lineNum;
+ if (!e.GetMsg().empty()) {
+ oss << ": " << e.GetMsg();
+ }
+ Error(oss.str());
+ }
+ }
+
+
// Read word alignments.
try {
ReadAlignment(alignmentLine, alignment);
} catch (const Exception &e) {
- std::ostringstream s;
- s << "Failed to read alignment at line " << lineNum << ": ";
- s << e.GetMsg();
- Error(s.str());
+ std::ostringstream oss;
+ oss << "Failed to read alignment at line " << lineNum << ": ";
+ oss << e.GetMsg();
+ Error(oss.str());
}
if (alignment.size() == 0) {
std::cerr << "skipping line " << lineNum << " without alignment points\n";
continue;
}
+ if (options.t2s) {
+ FlipAlignment(alignment);
+ }
// Record word counts.
- if (!options.unknownWordFile.empty()) {
- CollectWordLabelCounts(*t, options, wordCount, wordLabel);
+ if (!options.targetUnknownWordFile.empty()) {
+ CollectWordLabelCounts(*targetParseTree, options, targetWordCount, targetWordLabel);
+ }
+
+ // Record word counts: source side.
+ if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
+ CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount, sourceWordLabel);
}
// Form an alignment graph from the target tree, source words, and
// alignment.
- AlignmentGraph graph(t.get(), sourceTokens, alignment);
+ AlignmentGraph graph(targetParseTree.get(), sourceTokens, alignment);
// Extract minimal rules, adding each rule to its root node's rule set.
graph.ExtractMinimalRules(options);
@@ -165,36 +254,103 @@ int ExtractGHKM::Main(int argc, char *argv[])
graph.ExtractComposedRules(options);
}
+ // Initialize phrase orientation scoring object
+ PhraseOrientation phraseOrientation( sourceTokens.size(), targetXmlTreeParser.GetWords().size(), alignment);
+
// Write the rules, subject to scope pruning.
const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
for (std::vector<Node *>::const_iterator p = targetNodes.begin();
p != targetNodes.end(); ++p) {
+
const std::vector<const Subgraph *> &rules = (*p)->GetRules();
+
+ Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN;
+ if (options.phraseOrientation && !rules.empty()) {
+ int sourceSpanBegin = *((*p)->GetSpan().begin());
+ int sourceSpanEnd = *((*p)->GetSpan().rbegin());
+ l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_L2R);
+ r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_R2L);
+ // std::cerr << "span " << sourceSpanBegin << " " << sourceSpanEnd << std::endl;
+ // std::cerr << "phraseOrientation " << phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd) << std::endl;
+ }
+
for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
q != rules.end(); ++q) {
- ScfgRule r(**q);
+ // STSG output.
+ if (options.stsg) {
+ StsgRule rule(**q);
+ if (rule.Scope() <= options.maxScope) {
+ stsgWriter.Write(rule);
+ }
+ continue;
+ }
+ // SCFG output.
+ ScfgRule *r = 0;
+ if (options.sourceLabels) {
+ r = new ScfgRule(**q, &sourceSyntaxTree);
+ } else {
+ r = new ScfgRule(**q);
+ }
// TODO Can scope pruning be done earlier?
- if (r.Scope() <= options.maxScope) {
+ if (r->Scope() <= options.maxScope) {
if (!options.treeFragments) {
- writer.Write(r);
+ scfgWriter.Write(*r,false);
} else {
- writer.Write(r,**q);
+ scfgWriter.Write(*r,**q,false);
}
+ if (options.phraseOrientation) {
+ fwdExtractStream << " {{Orientation ";
+ phraseOrientation.WriteOrientation(fwdExtractStream,l2rOrientation);
+ fwdExtractStream << " ";
+ phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation);
+ fwdExtractStream << "}}";
+ phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_L2R,l2rOrientation,1);
+ phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_R2L,r2lOrientation,1);
+ }
+ fwdExtractStream << std::endl;
+ invExtractStream << std::endl;
}
+ delete r;
}
}
}
+ if (options.phraseOrientation) {
+ std::string phraseOrientationPriorsFileName = options.extractFile + std::string(".phraseOrientationPriors");
+ OutputFileStream phraseOrientationPriorsStream;
+ OpenOutputFileOrDie(phraseOrientationPriorsFileName, phraseOrientationPriorsStream);
+ PhraseOrientation::WritePriorCounts(phraseOrientationPriorsStream);
+ }
+
+ std::map<std::string,size_t> sourceLabels;
+ if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
+
+ sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side)
+ sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side)
+ sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar
+ sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar
+ size_t index = 0;
+ for (std::set<std::string>::const_iterator iter=sourceLabelSet.begin();
+ iter!=sourceLabelSet.end(); ++iter, ++index) {
+ sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
+ }
+ WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
+ }
+
if (!options.glueGrammarFile.empty()) {
- WriteGlueGrammar(labelSet, topLabelSet, glueGrammarStream);
+ WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream);
}
- if (!options.unknownWordFile.empty()) {
- WriteUnknownWordLabel(wordCount, wordLabel, options, unknownWordStream);
+ if (!options.targetUnknownWordFile.empty()) {
+ WriteUnknownWordLabel(targetWordCount, targetWordLabel, options, targetUnknownWordStream);
+ }
+
+ if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
+ WriteUnknownWordLabel(sourceWordCount, sourceWordLabel, options, sourceUnknownWordStream, true);
}
if (!options.unknownWordSoftMatchesFile.empty()) {
- WriteUnknownWordSoftMatches(labelSet, unknownWordSoftMatchesStream);
+ WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream);
}
return 0;
@@ -303,14 +459,28 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
"extract minimal rules only")
("PCFG",
"include score based on PCFG scores in target corpus")
+ ("PhraseOrientation",
+ "output phrase orientation information")
+ ("STSG",
+ "output STSG rules (default is SCFG)")
+ ("T2S",
+ "enable tree-to-string rule extraction (string-to-tree is assumed by default)")
("TreeFragments",
"output parse tree information")
+ ("SourceLabels",
+ "output source syntax label information")
+ ("SourceLabelSet",
+ po::value(&options.sourceLabelSetFile),
+ "write source syntax label set to named file")
("SentenceOffset",
po::value(&options.sentenceOffset)->default_value(options.sentenceOffset),
"set sentence number offset if processing split corpus")
("UnknownWordLabel",
- po::value(&options.unknownWordFile),
+ po::value(&options.targetUnknownWordFile),
"write unknown word labels to named file")
+ ("SourceUnknownWordLabel",
+ po::value(&options.sourceUnknownWordFile),
+ "write source syntax unknown word labels to named file")
("UnknownWordMinRelFreq",
po::value(&options.unknownWordMinRelFreq)->default_value(
options.unknownWordMinRelFreq),
@@ -399,9 +569,21 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("PCFG")) {
options.pcfg = true;
}
+ if (vm.count("PhraseOrientation")) {
+ options.phraseOrientation = true;
+ }
+ if (vm.count("STSG")) {
+ options.stsg = true;
+ }
+ if (vm.count("T2S")) {
+ options.t2s = true;
+ }
if (vm.count("TreeFragments")) {
options.treeFragments = true;
}
+ if (vm.count("SourceLabels")) {
+ options.sourceLabels = true;
+ }
if (vm.count("UnknownWordUniform")) {
options.unknownWordUniform = true;
}
@@ -411,7 +593,10 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
// Workaround for extract-parallel issue.
if (options.sentenceOffset > 0) {
- options.unknownWordFile.clear();
+ options.targetUnknownWordFile.clear();
+ }
+ if (options.sentenceOffset > 0) {
+ options.sourceUnknownWordFile.clear();
options.unknownWordSoftMatchesFile.clear();
}
}
@@ -422,7 +607,7 @@ void ExtractGHKM::Error(const std::string &msg) const
std::exit(1);
}
-std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s)
+std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s) const
{
std::vector<std::string> tokens;
@@ -454,9 +639,11 @@ std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s)
void ExtractGHKM::WriteGlueGrammar(
const std::set<std::string> &labelSet,
const std::map<std::string, int> &topLabelSet,
+ const std::map<std::string,size_t> &sourceLabels,
+ const Options &options,
std::ostream &out)
{
- // chose a top label that is not already a label
+ // choose a top label that is not already a label
std::string topLabel = "QQQQQQ";
for(size_t i = 1; i <= topLabel.length(); i++) {
if (labelSet.find(topLabel.substr(0,i)) == labelSet.end() ) {
@@ -465,23 +652,89 @@ void ExtractGHKM::WriteGlueGrammar(
}
}
+ size_t sourceLabelGlueTop = 0;
+ size_t sourceLabelGlueX = 1;
+
// basic rules
- out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| ||| {{Tree [" << topLabel << " <s>]}}" << std::endl;
- out << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| ||| {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}" << std::endl;
+ out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| |||";
+ if (options.treeFragments) {
+ out << " {{Tree [" << topLabel << " [SSTART <s>]]}}";
+ }
+ if (options.sourceLabels) {
+ out << " {{SourceLabels 1 1 " << sourceLabelGlueTop << " 1}}";
+ }
+ if (options.phraseOrientation) {
+ out << " {{Orientation 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25}}";
+ }
+ out << std::endl;
+
+ out << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1 ||| ||| |||";
+ if (options.treeFragments) {
+ out << " {{Tree [" << topLabel << " [" << topLabel << "] [SEND </s>]]}}";
+ }
+ if (options.sourceLabels) {
+ out << " {{SourceLabels 2 1 " << sourceLabelGlueTop << " 1 1 " << sourceLabelGlueTop << " 1}}";
+ }
+ if (options.phraseOrientation) {
+ out << " {{Orientation 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25}}";
+ }
+ out << std::endl;
// top rules
for (std::map<std::string, int>::const_iterator i = topLabelSet.begin();
i != topLabelSet.end(); ++i) {
- out << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1 ||| ||| ||| {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}" << std::endl;
+ out << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2 ||| ||| |||";
+ if (options.treeFragments) {
+ out << " {{Tree [" << topLabel << " [SSTART <s>] [" << i->first << "] [SEND </s>]]}}";
+ }
+ if (options.sourceLabels) {
+ out << " {{SourceLabels 2 1 " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
+ }
+ if (options.phraseOrientation) {
+ out << " {{Orientation 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25}}";
+ }
+ out << std::endl;
}
// glue rules
for(std::set<std::string>::const_iterator i = labelSet.begin();
i != labelSet.end(); i++ ) {
- out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| ||| {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}" << std::endl;
+ out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| |||";
+ if (options.treeFragments) {
+ out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
+ }
+ if (options.sourceLabels) {
+ out << " {{SourceLabels 3 2.718 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 2.718 1 " << sourceLabelGlueTop << " 2.718}}";
+ }
+ if (options.phraseOrientation) {
+ out << " {{Orientation 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25}}";
+ }
+ out << std::endl;
}
+
// glue rule for unknown word...
- out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| ||| {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}" << std::endl;
+ out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| |||";
+ if (options.treeFragments) {
+ out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
+ }
+ if (options.sourceLabels) {
+ out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
+ }
+ if (options.phraseOrientation) {
+ out << " {{Orientation 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25}}";
+ }
+ out << std::endl;
+}
+
+void ExtractGHKM::WriteSourceLabelSet(
+ const std::map<std::string,size_t> &sourceLabels,
+ std::ostream &out)
+{
+ out << sourceLabels.size() << std::endl;
+ for (std::map<std::string,size_t>::const_iterator iter=sourceLabels.begin();
+ iter!=sourceLabels.end(); ++iter) {
+ out << iter->first << " " << iter->second << std::endl;
+ }
}
void ExtractGHKM::CollectWordLabelCounts(
@@ -513,11 +766,26 @@ void ExtractGHKM::CollectWordLabelCounts(
}
}
+std::vector<std::string> ExtractGHKM::ReadTokens(const ParseTree &root) const
+{
+ std::vector<std::string> tokens;
+ std::vector<const ParseTree*> leaves;
+ root.GetLeaves(std::back_inserter(leaves));
+ for (std::vector<const ParseTree *>::const_iterator p = leaves.begin();
+ p != leaves.end(); ++p) {
+ const ParseTree &leaf = **p;
+ const std::string &word = leaf.GetLabel();
+ tokens.push_back(word);
+ }
+ return tokens;
+}
+
void ExtractGHKM::WriteUnknownWordLabel(
const std::map<std::string, int> &wordCount,
const std::map<std::string, std::string> &wordLabel,
const Options &options,
- std::ostream &out)
+ std::ostream &out,
+ bool writeCounts)
{
if (!options.unknownWordSoftMatchesFile.empty()) {
out << "UNK 1" << std::endl;
@@ -537,12 +805,19 @@ void ExtractGHKM::WriteUnknownWordLabel(
++total;
}
}
- for (std::map<std::string, int>::const_iterator p = labelCount.begin();
- p != labelCount.end(); ++p) {
- double ratio = static_cast<double>(p->second) / static_cast<double>(total);
- if (ratio >= options.unknownWordMinRelFreq) {
- float weight = options.unknownWordUniform ? 1.0f : ratio;
- out << p->first << " " << weight << std::endl;
+ if ( writeCounts ) {
+ for (std::map<std::string, int>::const_iterator p = labelCount.begin();
+ p != labelCount.end(); ++p) {
+ out << p->first << " " << p->second << std::endl;
+ }
+ } else {
+ for (std::map<std::string, int>::const_iterator p = labelCount.begin();
+ p != labelCount.end(); ++p) {
+ double ratio = static_cast<double>(p->second) / static_cast<double>(total);
+ if (ratio >= options.unknownWordMinRelFreq) {
+ float weight = options.unknownWordUniform ? 1.0f : ratio;
+ out << p->first << " " << weight << std::endl;
+ }
}
}
}
@@ -551,10 +826,9 @@ void ExtractGHKM::WriteUnknownWordSoftMatches(
const std::set<std::string> &labelSet,
std::ostream &out)
{
- std::set<std::string>::const_iterator p = labelSet.begin();
- for (p; p != labelSet.end(); ++p) {
- std::string label = *p;
- out << "UNK " << label << std::endl;
+ for (std::set<std::string>::const_iterator p = labelSet.begin(); p != labelSet.end(); ++p) {
+ std::string label = *p;
+ out << "UNK " << label << std::endl;
}
}
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h
index 15db9af86..df54ed250 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.h
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.h
@@ -59,13 +59,19 @@ private:
void WriteUnknownWordLabel(const std::map<std::string, int> &,
const std::map<std::string, std::string> &,
const Options &,
- std::ostream &);
+ std::ostream &,
+ bool writeCounts=false);
void WriteUnknownWordSoftMatches(const std::set<std::string> &,
std::ostream &);
void WriteGlueGrammar(const std::set<std::string> &,
const std::map<std::string, int> &,
+ const std::map<std::string,size_t> &,
+ const Options &,
std::ostream &);
- std::vector<std::string> ReadTokens(const std::string &);
+ void WriteSourceLabelSet(const std::map<std::string,size_t> &,
+ std::ostream &);
+ std::vector<std::string> ReadTokens(const std::string &) const;
+ std::vector<std::string> ReadTokens(const ParseTree &root) const;
void ProcessOptions(int, char *[], Options &) const;
diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h
index ffa9bfa35..48b866c17 100644
--- a/phrase-extract/extract-ghkm/Options.h
+++ b/phrase-extract/extract-ghkm/Options.h
@@ -40,11 +40,15 @@ public:
, maxScope(3)
, minimal(false)
, pcfg(false)
- , treeFragments(false)
+ , phraseOrientation(false)
, sentenceOffset(0)
- , unpairedExtractFormat(false)
+ , sourceLabels(false)
+ , stsg(false)
+ , t2s(false)
+ , treeFragments(false)
, unknownWordMinRelFreq(0.03f)
- , unknownWordUniform(false) {}
+ , unknownWordUniform(false)
+ , unpairedExtractFormat(false) {}
// Positional options
std::string targetFile;
@@ -63,13 +67,19 @@ public:
int maxScope;
bool minimal;
bool pcfg;
- bool treeFragments;
+ bool phraseOrientation;
int sentenceOffset;
- bool unpairedExtractFormat;
- std::string unknownWordFile;
- std::string unknownWordSoftMatchesFile;
+ bool sourceLabels;
+ std::string sourceLabelSetFile;
+ std::string sourceUnknownWordFile;
+ bool stsg;
+ bool t2s;
+ std::string targetUnknownWordFile;
+ bool treeFragments;
float unknownWordMinRelFreq;
+ std::string unknownWordSoftMatchesFile;
bool unknownWordUniform;
+ bool unpairedExtractFormat;
};
} // namespace GHKM
diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h
index 03da17735..694286c9d 100644
--- a/phrase-extract/extract-ghkm/ParseTree.h
+++ b/phrase-extract/extract-ghkm/ParseTree.h
@@ -63,7 +63,7 @@ public:
bool IsLeaf() const;
template<typename OutputIterator>
- void GetLeaves(OutputIterator);
+ void GetLeaves(OutputIterator) const;
private:
// Disallow copying
@@ -77,7 +77,7 @@ private:
};
template<typename OutputIterator>
-void ParseTree::GetLeaves(OutputIterator result)
+void ParseTree::GetLeaves(OutputIterator result) const
{
if (IsLeaf()) {
*result++ = this;
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp
new file mode 100644
index 000000000..5a8452f42
--- /dev/null
+++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp
@@ -0,0 +1,433 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "PhraseOrientation.h"
+
+#include <iostream>
+#include <sstream>
+#include <limits>
+#include <cassert>
+
+#include <boost/assign/list_of.hpp>
+
+namespace Moses
+{
+namespace GHKM
+{
+
+std::vector<float> PhraseOrientation::m_l2rOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0);
+std::vector<float> PhraseOrientation::m_r2lOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0);
+
+PhraseOrientation::PhraseOrientation(int sourceSize,
+ int targetSize,
+ const Alignment &alignment)
+ : m_countF(sourceSize)
+ , m_countE(targetSize)
+{
+
+ // prepare data structures for alignments
+ std::vector<std::vector<int> > alignedToS;
+ for(int i=0; i<m_countF; ++i) {
+ std::vector< int > dummy;
+ alignedToS.push_back(dummy);
+ }
+ for(int i=0; i<m_countE; ++i) {
+ std::vector< int > dummy;
+ m_alignedToT.push_back(dummy);
+ }
+ std::vector<int> alignedCountS(m_countF,0);
+
+ for (Alignment::const_iterator a=alignment.begin(); a!=alignment.end(); ++a) {
+ m_alignedToT[a->second].push_back(a->first);
+ alignedCountS[a->first]++;
+ alignedToS[a->first].push_back(a->second);
+ }
+
+ for (int startF=0; startF<m_countF; ++startF) {
+ for (int endF=startF; endF<m_countF; ++endF) {
+
+ int minE = std::numeric_limits<int>::max();
+ int maxE = -1;
+ for (int fi=startF; fi<=endF; ++fi) {
+ for (size_t i=0; i<alignedToS[fi].size(); ++i) {
+ int ei = alignedToS[fi][i];
+ if (ei<minE) {
+ minE = ei;
+ }
+ if (ei>maxE) {
+ maxE = ei;
+ }
+ }
+ }
+
+ m_minAndMaxAlignedToSourceSpan[ std::pair<int,int>(startF,endF) ] = std::pair<int,int>(minE,maxE);
+ }
+ }
+
+ // check alignments for target phrase startE...endE
+ // loop over continuous phrases which are compatible with the word alignments
+ for (int startE=0; startE<m_countE; ++startE) {
+ for (int endE=startE; endE<m_countE; ++endE) {
+
+ int minF = std::numeric_limits<int>::max();
+ int maxF = -1;
+ std::vector< int > usedF = alignedCountS;
+ for (int ei=startE; ei<=endE; ++ei) {
+ for (size_t i=0; i<m_alignedToT[ei].size(); ++i) {
+ int fi = m_alignedToT[ei][i];
+ if (fi<minF) {
+ minF = fi;
+ }
+ if (fi>maxF) {
+ maxF = fi;
+ }
+ usedF[fi]--;
+ }
+ }
+
+ m_minAndMaxAlignedToTargetSpan[ std::pair<int,int>(startE,endE) ] = std::pair<int,int>(minF,maxF);
+
+ if (maxF >= 0) { // aligned to any source words at all
+
+ // check if source words are aligned to out of bounds target words
+ bool out_of_bounds = false;
+ for (int fi=minF; fi<=maxF && !out_of_bounds; ++fi)
+ if (usedF[fi]>0) {
+ // cout << "out of bounds: " << fi << "\n";
+ out_of_bounds = true;
+ }
+
+ // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
+ if (!out_of_bounds) {
+ // start point of source phrase may retreat over unaligned
+ for (int startF=minF;
+ (startF>=0 &&
+ (startF==minF || alignedCountS[startF]==0)); // unaligned
+ startF--) {
+ // end point of source phrase may advance over unaligned
+ for (int endF=maxF;
+ (endF<m_countF &&
+ (endF==maxF || alignedCountS[endF]==0)); // unaligned
+ endF++) { // at this point we have extracted a phrase
+
+ InsertPhraseVertices(m_topLeft, m_topRight, m_bottomLeft, m_bottomRight,
+ startF, startE, endF, endE);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+
+void PhraseOrientation::InsertVertex( HSentenceVertices & corners, int x, int y )
+{
+ std::set<int> tmp;
+ tmp.insert(x);
+ std::pair< HSentenceVertices::iterator, bool > ret = corners.insert( std::pair<int, std::set<int> > (y, tmp) );
+ if (ret.second == false) {
+ ret.first->second.insert(x);
+ }
+}
+
+
+void PhraseOrientation::InsertPhraseVertices(HSentenceVertices & topLeft,
+ HSentenceVertices & topRight,
+ HSentenceVertices & bottomLeft,
+ HSentenceVertices & bottomRight,
+ int startF, int startE, int endF, int endE)
+{
+
+ InsertVertex(topLeft, startF, startE);
+ InsertVertex(topRight, endF, startE);
+ InsertVertex(bottomLeft, startF, endE);
+ InsertVertex(bottomRight, endF, endE);
+}
+
+
+const std::string PhraseOrientation::GetOrientationInfoString(int startF, int endF, REO_DIR direction) const
+{
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax
+ = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
+
+ if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) {
+ int startE = (foundMinMax->second).first;
+ int endE = (foundMinMax->second).second;
+// std::cerr << "Phrase orientation for"
+// << " startF=" << startF
+// << " endF=" << endF
+// << " startE=" << startE
+// << " endE=" << endE
+// << std::endl;
+ return GetOrientationInfoString(startF, startE, endF, endE, direction);
+ } else {
+ std::cerr << "PhraseOrientation::GetOrientationInfoString(): Error: not able to determine phrase orientation" << std::endl;
+ std::exit(1);
+ }
+}
+
+
+const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const
+{
+ REO_CLASS hierPrevOrient=REO_CLASS_UNKNOWN, hierNextOrient=REO_CLASS_UNKNOWN;
+
+ if ( direction == REO_DIR_L2R || direction == REO_DIR_BIDIR )
+ hierPrevOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_L2R);
+
+ if ( direction == REO_DIR_R2L || direction == REO_DIR_BIDIR )
+ hierNextOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_R2L);
+
+ switch (direction) {
+ case REO_DIR_L2R:
+ return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR);
+ break;
+ case REO_DIR_R2L:
+ return GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
+ break;
+ case REO_DIR_BIDIR:
+ return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
+ break;
+ default:
+ return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
+ break;
+ }
+ return "PhraseOrientationERROR";
+}
+
+
+PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR direction) const
+{
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax
+ = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
+
+ if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) {
+ int startE = (foundMinMax->second).first;
+ int endE = (foundMinMax->second).second;
+// std::cerr << "Phrase orientation for"
+// << " startF=" << startF
+// << " endF=" << endF
+// << " startE=" << startE
+// << " endE=" << endE
+// << std::endl;
+ return GetOrientationInfo(startF, startE, endF, endE, direction);
+ } else {
+ std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: not able to determine phrase orientation" << std::endl;
+ std::exit(1);
+ }
+}
+
+
+PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const
+{
+ if ( direction != REO_DIR_L2R && direction != REO_DIR_R2L ) {
+ std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: direction should be either L2R or R2L" << std::endl;
+ std::exit(1);
+ }
+
+ if ( direction == REO_DIR_L2R )
+ return GetOrientHierModel(REO_MODEL_TYPE_MSLR,
+ startF, endF, startE, endE, m_countF-1, 0, 0, 1,
+ &ge, &le,
+ m_bottomRight, m_bottomLeft);
+
+ if ( direction == REO_DIR_R2L )
+ return GetOrientHierModel(REO_MODEL_TYPE_MSLR,
+ endF, startF, endE, startE, 0, m_countF-1, m_countE-1, -1,
+ &le, &ge,
+ m_topLeft, m_topRight);
+
+ return REO_CLASS_UNKNOWN;
+}
+
+
+// to be called with countF-1 instead of countF
+PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientHierModel(REO_MODEL_TYPE modelType,
+ int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit,
+ bool (*ge)(int, int), bool (*le)(int, int),
+ const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const
+{
+ bool leftSourceSpanIsAligned = ( (startF != zeroF) && SourceSpanIsAligned(zeroF,startF-unit) );
+ bool topTargetSpanIsAligned = ( (startE != zeroE) && TargetSpanIsAligned(zeroE,startE-unit) );
+
+ if (!topTargetSpanIsAligned && !leftSourceSpanIsAligned)
+ return REO_CLASS_LEFT;
+
+ HSentenceVertices::const_iterator it;
+
+ if (//(connectedLeftTop && !connectedRightTop) ||
+ ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
+ it->second.find(startF-unit) != it->second.end()))
+ return REO_CLASS_LEFT;
+
+ if (modelType == REO_MODEL_TYPE_MONO)
+ return REO_CLASS_UNKNOWN;
+
+ if (//(!connectedLeftTop && connectedRightTop) ||
+ ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
+ it->second.find(endF + unit) != it->second.end()))
+ return REO_CLASS_RIGHT;
+
+ if (modelType == REO_MODEL_TYPE_MSD)
+ return REO_CLASS_UNKNOWN;
+
+ for (int indexF=startF-2*unit; (*ge)(indexF, zeroF); indexF=indexF-unit)
+ {
+ if ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
+ it->second.find(indexF) != it->second.end())
+ return REO_CLASS_DLEFT;
+ }
+
+ for (int indexF=endF+2*unit; (*le)(indexF, countF); indexF=indexF+unit)
+ {
+ if ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
+ it->second.find(indexF) != it->second.end())
+ return REO_CLASS_DRIGHT;
+ }
+
+ return REO_CLASS_UNKNOWN;
+}
+
+bool PhraseOrientation::SourceSpanIsAligned(int index1, int index2) const
+{
+ return SpanIsAligned(index1, index2, m_minAndMaxAlignedToSourceSpan);
+}
+
+bool PhraseOrientation::TargetSpanIsAligned(int index1, int index2) const
+{
+ return SpanIsAligned(index1, index2, m_minAndMaxAlignedToTargetSpan);
+}
+
+bool PhraseOrientation::SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const
+{
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator itMinAndMaxAligned =
+ minAndMaxAligned.find(std::pair<int,int>(std::min(index1,index2),std::max(index1,index2)));
+
+ if (itMinAndMaxAligned == minAndMaxAligned.end())
+ {
+ std::cerr << "PhraseOrientation::SourceSpanIsAligned(): Error" << std::endl;
+ std::exit(1);
+ }
+ else
+ {
+ if (itMinAndMaxAligned->second.first == std::numeric_limits<int>::max())
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
+
+const std::string PhraseOrientation::GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType)
+{
+ std::ostringstream oss;
+ WriteOrientation(oss, orient, modelType);
+ return oss.str();
+}
+
+
+void PhraseOrientation::WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType)
+{
+ switch(orient) {
+ case REO_CLASS_LEFT:
+ out << "mono";
+ break;
+ case REO_CLASS_RIGHT:
+ out << "swap";
+ break;
+ case REO_CLASS_DLEFT:
+ out << "dleft";
+ break;
+ case REO_CLASS_DRIGHT:
+ out << "dright";
+ break;
+ case REO_CLASS_UNKNOWN:
+ switch(modelType) {
+ case REO_MODEL_TYPE_MONO:
+ out << "nomono";
+ break;
+ case REO_MODEL_TYPE_MSD:
+ out << "other";
+ break;
+ case REO_MODEL_TYPE_MSLR:
+ out << "dleft";
+ break;
+ }
+ break;
+ }
+}
+
+
+bool PhraseOrientation::IsAligned(int fi, int ei) const
+{
+ if (ei == -1 && fi == -1)
+ return true;
+
+ if (ei <= -1 || fi <= -1)
+ return false;
+
+ if (ei == m_countE && fi == m_countF)
+ return true;
+
+ if (ei >= m_countE || fi >= m_countF)
+ return false;
+
+ for (size_t i=0; i<m_alignedToT[ei].size(); ++i)
+ if (m_alignedToT[ei][i] == fi)
+ return true;
+
+ return false;
+}
+
+
+void PhraseOrientation::IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment)
+{
+ assert(direction==REO_DIR_L2R || direction==REO_DIR_R2L);
+ if (direction == REO_DIR_L2R) {
+ m_l2rOrientationPriorCounts[orient] += increment;
+ } else if (direction == REO_DIR_R2L) {
+ m_r2lOrientationPriorCounts[orient] += increment;
+ }
+}
+
+
+void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType)
+{
+ std::map<std::string,float> l2rOrientationPriorCountsMap;
+ std::map<std::string,float> r2lOrientationPriorCountsMap;
+ for (int orient=0; orient<=REO_CLASS_UNKNOWN; ++orient) {
+ l2rOrientationPriorCountsMap[GetOrientationString((REO_CLASS)orient, modelType)] += m_l2rOrientationPriorCounts[orient];
+ }
+ for (int orient=0; orient<=REO_CLASS_UNKNOWN; ++orient) {
+ r2lOrientationPriorCountsMap[GetOrientationString((REO_CLASS)orient, modelType)] += m_r2lOrientationPriorCounts[orient];
+ }
+ for (std::map<std::string,float>::const_iterator l2rOrientationPriorCountsMapIt = l2rOrientationPriorCountsMap.begin();
+ l2rOrientationPriorCountsMapIt != l2rOrientationPriorCountsMap.end(); ++l2rOrientationPriorCountsMapIt) {
+ out << "L2R_" << l2rOrientationPriorCountsMapIt->first << " " << l2rOrientationPriorCountsMapIt->second << std::endl;
+ }
+ for (std::map<std::string,float>::const_iterator r2lOrientationPriorCountsMapIt = r2lOrientationPriorCountsMap.begin();
+ r2lOrientationPriorCountsMapIt != r2lOrientationPriorCountsMap.end(); ++r2lOrientationPriorCountsMapIt) {
+ out << "R2L_" << r2lOrientationPriorCountsMapIt->first << " " << r2lOrientationPriorCountsMapIt->second << std::endl;
+ }
+}
+
+} // namespace GHKM
+} // namespace Moses
+
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h
new file mode 100644
index 000000000..313c1f3df
--- /dev/null
+++ b/phrase-extract/extract-ghkm/PhraseOrientation.h
@@ -0,0 +1,106 @@
+
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include "Alignment.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/unordered_map.hpp>
+
+namespace Moses
+{
+namespace GHKM
+{
+
+// The key of the map is the English index and the value is a set of the source ones
+typedef std::map <int, std::set<int> > HSentenceVertices;
+
+
+class PhraseOrientation
+{
+public:
+
+ enum REO_MODEL_TYPE {REO_MODEL_TYPE_MSD, REO_MODEL_TYPE_MSLR, REO_MODEL_TYPE_MONO};
+ enum REO_CLASS {REO_CLASS_LEFT, REO_CLASS_RIGHT, REO_CLASS_DLEFT, REO_CLASS_DRIGHT, REO_CLASS_UNKNOWN};
+ enum REO_DIR {REO_DIR_L2R, REO_DIR_R2L, REO_DIR_BIDIR};
+
+
+ PhraseOrientation(int sourceSize,
+ int targetSize,
+ const Alignment &alignment);
+
+ REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const;
+ REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const;
+ const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=REO_DIR_BIDIR) const;
+ const std::string GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction=REO_DIR_BIDIR) const;
+ static const std::string GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
+ static void WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
+ void IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment);
+ static void WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
+ bool SourceSpanIsAligned(int index1, int index2) const;
+ bool TargetSpanIsAligned(int index1, int index2) const;
+
+private:
+
+ void InsertVertex( HSentenceVertices & corners, int x, int y );
+
+ void InsertPhraseVertices(HSentenceVertices & topLeft,
+ HSentenceVertices & topRight,
+ HSentenceVertices & bottomLeft,
+ HSentenceVertices & bottomRight,
+ int startF, int startE, int endF, int endE);
+
+ REO_CLASS GetOrientHierModel(REO_MODEL_TYPE modelType,
+ int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit,
+ bool (*ge)(int, int), bool (*lt)(int, int),
+ const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const;
+
+ bool SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const;
+
+ bool IsAligned(int fi, int ei) const;
+
+ static bool ge(int first, int second) { return first >= second; };
+ static bool le(int first, int second) { return first <= second; };
+ static bool lt(int first, int second) { return first < second; };
+
+ const int m_countF;
+ const int m_countE;
+
+ std::vector<std::vector<int> > m_alignedToT;
+
+ HSentenceVertices m_topLeft;
+ HSentenceVertices m_topRight;
+ HSentenceVertices m_bottomLeft;
+ HSentenceVertices m_bottomRight;
+
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToSourceSpan;
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToTargetSpan;
+
+ static std::vector<float> m_l2rOrientationPriorCounts;
+ static std::vector<float> m_r2lOrientationPriorCounts;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
diff --git a/phrase-extract/extract-ghkm/Rule.cpp b/phrase-extract/extract-ghkm/Rule.cpp
new file mode 100644
index 000000000..da6b2ff23
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Rule.cpp
@@ -0,0 +1,41 @@
+#include "Rule.h"
+
+#include "Node.h"
+#include "Subgraph.h"
+
+namespace Moses
+{
+namespace GHKM
+{
+
+int Rule::Scope(const std::vector<Symbol> &symbols)
+{
+ int scope = 0;
+ bool predIsNonTerm = false;
+ if (symbols[0].GetType() == NonTerminal) {
+ ++scope;
+ predIsNonTerm = true;
+ }
+ for (std::size_t i = 1; i < symbols.size(); ++i) {
+ bool isNonTerm = symbols[i].GetType() == NonTerminal;
+ if (isNonTerm && predIsNonTerm) {
+ ++scope;
+ }
+ predIsNonTerm = isNonTerm;
+ }
+ if (predIsNonTerm) {
+ ++scope;
+ }
+ return scope;
+}
+
+bool Rule::PartitionOrderComp(const Node *a, const Node *b)
+{
+ const Span &aSpan = a->GetSpan();
+ const Span &bSpan = b->GetSpan();
+ assert(!aSpan.empty() && !bSpan.empty());
+ return *(aSpan.begin()) < *(bSpan.begin());
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/Rule.h b/phrase-extract/extract-ghkm/Rule.h
new file mode 100644
index 000000000..186cfda37
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Rule.h
@@ -0,0 +1,58 @@
+#pragma once
+#ifndef EXTRACT_GHKM_RULE_H_
+#define EXTRACT_GHKM_RULE_H_
+
+#include <string>
+#include <vector>
+
+#include "Alignment.h"
+
+namespace Moses
+{
+namespace GHKM
+{
+
+class Node;
+
+enum SymbolType { Terminal, NonTerminal };
+
+class Symbol {
+public:
+ Symbol(const std::string &v, SymbolType t) : m_value(v) , m_type(t) {}
+
+ const std::string &GetValue() const {
+ return m_value;
+ }
+ SymbolType GetType() const {
+ return m_type;
+ }
+
+private:
+ std::string m_value;
+ SymbolType m_type;
+};
+
+// Base class for ScfgRule and StsgRule.
+class Rule
+{
+public:
+ virtual ~Rule() {}
+
+ const Alignment &GetAlignment() const {
+ return m_alignment;
+ }
+
+ virtual int Scope() const = 0;
+
+protected:
+ static bool PartitionOrderComp(const Node *, const Node *);
+
+ static int Scope(const std::vector<Symbol>&);
+
+ Alignment m_alignment;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp
index 2c901413d..af801d648 100644
--- a/phrase-extract/extract-ghkm/ScfgRule.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -21,6 +21,7 @@
#include "Node.h"
#include "Subgraph.h"
+#include "SyntaxTree.h"
#include <algorithm>
@@ -29,11 +30,14 @@ namespace Moses
namespace GHKM
{
-ScfgRule::ScfgRule(const Subgraph &fragment)
+ScfgRule::ScfgRule(const Subgraph &fragment,
+ const MosesTraining::SyntaxTree *sourceSyntaxTree)
: m_sourceLHS("X", NonTerminal)
, m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
, m_pcfgScore(fragment.GetPcfgScore())
+ , m_hasSourceLabels(sourceSyntaxTree)
{
+
// Source RHS
const std::set<const Node *> &leaves = fragment.GetLeaves();
@@ -55,6 +59,7 @@ ScfgRule::ScfgRule(const Subgraph &fragment)
std::map<const Node *, std::vector<int> > sourceOrder;
m_sourceRHS.reserve(sourceRHSNodes.size());
+ m_numberOfNonTerminals = 0;
int srcIndex = 0;
for (std::vector<const Node *>::const_iterator p(sourceRHSNodes.begin());
p != sourceRHSNodes.end(); ++p, ++srcIndex) {
@@ -62,6 +67,11 @@ ScfgRule::ScfgRule(const Subgraph &fragment)
if (sinkNode.GetType() == TREE) {
m_sourceRHS.push_back(Symbol("X", NonTerminal));
sourceOrder[&sinkNode].push_back(srcIndex);
+ ++m_numberOfNonTerminals;
+ if (sourceSyntaxTree) {
+ // Source syntax label
+ PushSourceLabel(sourceSyntaxTree,&sinkNode,"XRHS");
+ }
} else {
assert(sinkNode.GetType() == SOURCE);
m_sourceRHS.push_back(Symbol(sinkNode.GetLabel(), Terminal));
@@ -112,35 +122,76 @@ ScfgRule::ScfgRule(const Subgraph &fragment)
}
}
}
+
+ if (sourceSyntaxTree) {
+ // Source syntax label for root node (if sourceSyntaxTree available)
+ PushSourceLabel(sourceSyntaxTree,fragment.GetRoot(),"XLHS");
+ // All non-terminal spans (including the LHS) should have obtained a label
+ // (a source-side syntactic constituent label if the span matches, "XLHS" otherwise)
+ assert(m_sourceLabels.size() == m_numberOfNonTerminals+1);
+ }
}
-int ScfgRule::Scope() const
+void ScfgRule::PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
+ const Node *node,
+ const std::string &nonMatchingLabel)
{
- int scope = 0;
- bool predIsNonTerm = false;
- if (m_sourceRHS[0].GetType() == NonTerminal) {
- ++scope;
- predIsNonTerm = true;
- }
- for (size_t i = 1; i < m_sourceRHS.size(); ++i) {
- bool isNonTerm = m_sourceRHS[i].GetType() == NonTerminal;
- if (isNonTerm && predIsNonTerm) {
- ++scope;
- }
- predIsNonTerm = isNonTerm;
- }
- if (predIsNonTerm) {
- ++scope;
+ ContiguousSpan span = Closure(node->GetSpan());
+ if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span?
+ std::vector<MosesTraining::SyntaxNode*> sourceLabels =
+ sourceSyntaxTree->GetNodes(span.first,span.second);
+ if (!sourceLabels.empty()) {
+ // store the topmost matching label from the source syntax tree
+ m_sourceLabels.push_back(sourceLabels.back()->GetLabel());
+ }
+ } else {
+ // no matching source-side syntactic constituent: store nonMatchingLabel
+ m_sourceLabels.push_back(nonMatchingLabel);
}
- return scope;
}
-bool ScfgRule::PartitionOrderComp(const Node *a, const Node *b)
+// TODO: rather implement the method external to ScfgRule
+void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::string,float>* > &coocCounts, float count) const
{
- const Span &aSpan = a->GetSpan();
- const Span &bSpan = b->GetSpan();
- assert(!aSpan.empty() && !bSpan.empty());
- return *(aSpan.begin()) < *(bSpan.begin());
+ std::map<int, int> sourceToTargetNTMap;
+ std::map<int, int> targetToSourceNTMap;
+
+ for (Alignment::const_iterator p(m_alignment.begin());
+ p != m_alignment.end(); ++p) {
+ if ( m_sourceRHS[p->first].GetType() == NonTerminal ) {
+ assert(m_targetRHS[p->second].GetType() == NonTerminal);
+ sourceToTargetNTMap[p->first] = p->second;
+ }
+ }
+
+ size_t sourceIndex = 0;
+ size_t sourceNonTerminalIndex = 0;
+ for (std::vector<Symbol>::const_iterator p=m_sourceRHS.begin();
+ p != m_sourceRHS.end(); ++p, ++sourceIndex) {
+ if ( p->GetType() == NonTerminal ) {
+ const std::string &sourceLabel = m_sourceLabels[sourceNonTerminalIndex];
+ int targetIndex = sourceToTargetNTMap[sourceIndex];
+ const std::string &targetLabel = m_targetRHS[targetIndex].GetValue();
+ ++sourceNonTerminalIndex;
+
+ std::map<std::string,float>* countMap = NULL;
+ std::map< std::string, std::map<std::string,float>* >::iterator iter = coocCounts.find(sourceLabel);
+ if ( iter == coocCounts.end() ) {
+ std::map<std::string,float> *newCountMap = new std::map<std::string,float>();
+ std::pair< std::map< std::string, std::map<std::string,float>* >::iterator, bool > inserted =
+ coocCounts.insert( std::pair< std::string, std::map<std::string,float>* >(sourceLabel, newCountMap) );
+ assert(inserted.second);
+ countMap = (inserted.first)->second;
+ } else {
+ countMap = iter->second;
+ }
+ std::pair< std::map<std::string,float>::iterator, bool > inserted =
+ countMap->insert( std::pair< std::string,float>(targetLabel, count) );
+ if ( !inserted.second ) {
+ (inserted.first)->second += count;
+ }
+ }
+ }
}
} // namespace GHKM
diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h
index 21a9e9900..6b8abb94e 100644
--- a/phrase-extract/extract-ghkm/ScfgRule.h
+++ b/phrase-extract/extract-ghkm/ScfgRule.h
@@ -22,9 +22,14 @@
#define EXTRACT_GHKM_SCFG_RULE_H_
#include "Alignment.h"
+#include "Rule.h"
+#include "SyntaxTree.h"
#include <string>
#include <vector>
+#include <list>
+#include <memory>
+#include <iostream>
namespace Moses
{
@@ -34,28 +39,11 @@ namespace GHKM
class Node;
class Subgraph;
-enum SymbolType { Terminal, NonTerminal };
-
-struct Symbol {
-public:
- Symbol(const std::string &v, SymbolType t) : m_value(v) , m_type(t) {}
-
- const std::string &GetValue() const {
- return m_value;
- }
- SymbolType GetType() const {
- return m_type;
- }
-
-private:
- std::string m_value;
- SymbolType m_type;
-};
-
-class ScfgRule
+class ScfgRule : public Rule
{
public:
- ScfgRule(const Subgraph &fragment);
+ ScfgRule(const Subgraph &fragment,
+ const MosesTraining::SyntaxTree *sourceSyntaxTree = 0);
const Symbol &GetSourceLHS() const {
return m_sourceLHS;
@@ -69,24 +57,38 @@ public:
const std::vector<Symbol> &GetTargetRHS() const {
return m_targetRHS;
}
- const Alignment &GetAlignment() const {
- return m_alignment;
- }
float GetPcfgScore() const {
return m_pcfgScore;
}
+ bool HasSourceLabels() const {
+ return m_hasSourceLabels;
+ }
+ void PrintSourceLabels(std::ostream &out) const {
+ for (std::vector<std::string>::const_iterator it = m_sourceLabels.begin();
+ it != m_sourceLabels.end(); ++it) {
+ out << " " << (*it);
+ }
+ }
+ void UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::string,float>* > &coocCounts,
+ float count) const;
- int Scope() const;
+ int Scope() const {
+ return Rule::Scope(m_sourceRHS);
+ }
private:
- static bool PartitionOrderComp(const Node *, const Node *);
+ void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
+ const Node *node,
+ const std::string &nonMatchingLabel);
Symbol m_sourceLHS;
Symbol m_targetLHS;
std::vector<Symbol> m_sourceRHS;
std::vector<Symbol> m_targetRHS;
- Alignment m_alignment;
float m_pcfgScore;
+ bool m_hasSourceLabels;
+ std::vector<std::string> m_sourceLabels;
+ unsigned m_numberOfNonTerminals;
};
} // namespace GHKM
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index aea984ae0..d306b845f 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -47,14 +47,26 @@ void ScfgRuleWriter::Write(const ScfgRule &rule, bool printEndl)
}
// Write the rule to the forward and inverse extract files.
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
- m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+ if (m_options.t2s) {
+ // If model is tree-to-string then flip the source and target.
+ m_fwd << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+ m_inv << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+ } else {
+ m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+ m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+ }
const Alignment &alignment = rule.GetAlignment();
for (Alignment::const_iterator p = alignment.begin();
p != alignment.end(); ++p) {
- m_fwd << " " << p->first << "-" << p->second;
- m_inv << " " << p->second << "-" << p->first;
+ if (m_options.t2s) {
+ // If model is tree-to-string then flip the source and target.
+ m_fwd << " " << p->second << "-" << p->first;
+ m_inv << " " << p->first << "-" << p->second;
+ } else {
+ m_fwd << " " << p->first << "-" << p->second;
+ m_inv << " " << p->second << "-" << p->first;
+ }
}
// Write a count of 1.
@@ -66,6 +78,14 @@ void ScfgRuleWriter::Write(const ScfgRule &rule, bool printEndl)
m_fwd << " ||| " << std::exp(rule.GetPcfgScore());
}
+ m_fwd << " |||";
+
+ if (m_options.sourceLabels && rule.HasSourceLabels()) {
+ m_fwd << " {{SourceLabels";
+ rule.PrintSourceLabels(m_fwd);
+ m_fwd << "}}";
+ }
+
if (printEndl) {
m_fwd << std::endl;
m_inv << std::endl;
@@ -163,14 +183,17 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
}
}
-void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g)
+void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g, bool printEndl)
{
Write(rule,false);
m_fwd << " {{Tree ";
g.PrintTree(m_fwd);
m_fwd << "}}";
- m_fwd << std::endl;
- m_inv << std::endl;
+
+ if (printEndl) {
+ m_fwd << std::endl;
+ m_inv << std::endl;
+ }
}
} // namespace GHKM
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index 01883cdff..240492824 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -32,7 +32,7 @@ namespace GHKM
struct Options;
class ScfgRule;
-struct Symbol;
+class Symbol;
class ScfgRuleWriter
{
@@ -44,7 +44,7 @@ public:
void Write(const ScfgRule &rule, bool printEndl=true);
- void Write(const ScfgRule &rule, const Subgraph &g);
+ void Write(const ScfgRule &rule, const Subgraph &g, bool printEndl=true);
private:
// Disallow copying
diff --git a/phrase-extract/extract-ghkm/StsgRule.cpp b/phrase-extract/extract-ghkm/StsgRule.cpp
new file mode 100644
index 000000000..a6100ac96
--- /dev/null
+++ b/phrase-extract/extract-ghkm/StsgRule.cpp
@@ -0,0 +1,95 @@
+#include "StsgRule.h"
+
+#include "Node.h"
+#include "Subgraph.h"
+#include "SyntaxTree.h"
+
+#include <algorithm>
+
+namespace Moses
+{
+namespace GHKM
+{
+
+StsgRule::StsgRule(const Subgraph &fragment)
+ : m_targetSide(fragment, true)
+{
+ // Source side
+
+ const std::set<const Node *> &sinkNodes = fragment.GetLeaves();
+
+ // Collect the subset of sink nodes that excludes target nodes with
+ // empty spans.
+ std::vector<const Node *> productiveSinks;
+ productiveSinks.reserve(sinkNodes.size());
+ for (std::set<const Node *>::const_iterator p = sinkNodes.begin();
+ p != sinkNodes.end(); ++p) {
+ const Node *sink = *p;
+ if (!sink->GetSpan().empty()) {
+ productiveSinks.push_back(sink);
+ }
+ }
+
+ // Sort them into the order defined by their spans.
+ std::sort(productiveSinks.begin(), productiveSinks.end(), PartitionOrderComp);
+
+ // Build a map from target nodes to source-order indices, so that we
+ // can construct the Alignment object later.
+ std::map<const Node *, std::vector<int> > sinkToSourceIndices;
+ std::map<const Node *, int> nonTermSinkToSourceIndex;
+
+ m_sourceSide.reserve(productiveSinks.size());
+ int srcIndex = 0;
+ int nonTermCount = 0;
+ for (std::vector<const Node *>::const_iterator p = productiveSinks.begin();
+ p != productiveSinks.end(); ++p, ++srcIndex) {
+ const Node &sink = **p;
+ if (sink.GetType() == TREE) {
+ m_sourceSide.push_back(Symbol("X", NonTerminal));
+ sinkToSourceIndices[&sink].push_back(srcIndex);
+ nonTermSinkToSourceIndex[&sink] = nonTermCount++;
+ } else {
+ assert(sink.GetType() == SOURCE);
+ m_sourceSide.push_back(Symbol(sink.GetLabel(), Terminal));
+ // Add all aligned target words to the sinkToSourceIndices map
+ const std::vector<Node *> &parents(sink.GetParents());
+ for (std::vector<Node *>::const_iterator q = parents.begin();
+ q != parents.end(); ++q) {
+ if ((*q)->GetType() == TARGET) {
+ sinkToSourceIndices[*q].push_back(srcIndex);
+ }
+ }
+ }
+ }
+
+ // Alignment
+
+ std::vector<const Node *> targetLeaves;
+ m_targetSide.GetTargetLeaves(targetLeaves);
+
+ m_alignment.reserve(targetLeaves.size());
+ m_nonTermAlignment.resize(nonTermCount);
+
+ for (int i = 0, j = 0; i < targetLeaves.size(); ++i) {
+ const Node *leaf = targetLeaves[i];
+ assert(leaf->GetType() != SOURCE);
+ if (leaf->GetSpan().empty()) {
+ continue;
+ }
+ std::map<const Node *, std::vector<int> >::iterator p =
+ sinkToSourceIndices.find(leaf);
+ assert(p != sinkToSourceIndices.end());
+ std::vector<int> &sourceNodes = p->second;
+ for (std::vector<int>::iterator r = sourceNodes.begin();
+ r != sourceNodes.end(); ++r) {
+ int srcIndex = *r;
+ m_alignment.push_back(std::make_pair(srcIndex, i));
+ }
+ if (leaf->GetType() == TREE) {
+ m_nonTermAlignment[nonTermSinkToSourceIndex[leaf]] = j++;
+ }
+ }
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/StsgRule.h b/phrase-extract/extract-ghkm/StsgRule.h
new file mode 100644
index 000000000..b14695c5c
--- /dev/null
+++ b/phrase-extract/extract-ghkm/StsgRule.h
@@ -0,0 +1,44 @@
+#pragma once
+#ifndef EXTRACT_GHKM_STSG_RULE_H_
+#define EXTRACT_GHKM_STSG_RULE_H_
+
+#include "Rule.h"
+#include "Subgraph.h"
+
+#include <vector>
+
+namespace Moses
+{
+namespace GHKM
+{
+
+class Node;
+
+class StsgRule : public Rule
+{
+public:
+ StsgRule(const Subgraph &fragment);
+
+ const std::vector<Symbol> &GetSourceSide() const {
+ return m_sourceSide;
+ }
+ const Subgraph &GetTargetSide() const {
+ return m_targetSide;
+ }
+ const std::vector<int> &GetNonTermAlignment() const {
+ return m_nonTermAlignment;
+ }
+ int Scope() const {
+ return Rule::Scope(m_sourceSide);
+ }
+
+private:
+ std::vector<Symbol> m_sourceSide;
+ Subgraph m_targetSide;
+ std::vector<int> m_nonTermAlignment;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/StsgRuleWriter.cpp b/phrase-extract/extract-ghkm/StsgRuleWriter.cpp
new file mode 100644
index 000000000..a9596b65c
--- /dev/null
+++ b/phrase-extract/extract-ghkm/StsgRuleWriter.cpp
@@ -0,0 +1,95 @@
+#include "StsgRuleWriter.h"
+
+#include "Alignment.h"
+#include "Options.h"
+#include "StsgRule.h"
+
+#include <cassert>
+#include <cmath>
+#include <ostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+namespace Moses
+{
+namespace GHKM
+{
+
+void StsgRuleWriter::Write(const StsgRule &rule)
+{
+ std::ostringstream sourceSS;
+ std::ostringstream targetSS;
+
+ // Write the source side of the rule to sourceSS.
+ const std::vector<Symbol> &sourceSide = rule.GetSourceSide();
+ for (std::size_t i = 0; i < sourceSide.size(); ++i) {
+ const Symbol &symbol = sourceSide[i];
+ if (i > 0) {
+ sourceSS << " ";
+ }
+ if (symbol.GetType() == NonTerminal) {
+ sourceSS << "[X]";
+ } else {
+ sourceSS << symbol.GetValue();
+ }
+ }
+
+ // Write the target side of the rule to targetSS.
+ rule.GetTargetSide().PrintTree(targetSS);
+
+ // Write the rule to the forward and inverse extract files.
+ if (m_options.t2s) {
+ // If model is tree-to-string then flip the source and target.
+ m_fwd << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+ m_inv << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+ } else {
+ m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+ m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+ }
+
+ // Write the non-terminal alignments.
+ const std::vector<int> &nonTermAlignment = rule.GetNonTermAlignment();
+ for (int srcIndex = 0; srcIndex < nonTermAlignment.size(); ++srcIndex) {
+ int tgtIndex = nonTermAlignment[srcIndex];
+ if (m_options.t2s) {
+ // If model is tree-to-string then flip the source and target.
+ m_fwd << " " << tgtIndex << "-" << srcIndex;
+ m_inv << " " << srcIndex << "-" << tgtIndex;
+ } else {
+ m_fwd << " " << srcIndex << "-" << tgtIndex;
+ m_inv << " " << tgtIndex << "-" << srcIndex;
+ }
+ }
+ m_fwd << " |||";
+ m_inv << " |||";
+
+ // Write the symbol alignments.
+ const Alignment &alignment = rule.GetAlignment();
+ for (Alignment::const_iterator p = alignment.begin();
+ p != alignment.end(); ++p) {
+ if (m_options.t2s) {
+ // If model is tree-to-string then flip the source and target.
+ m_fwd << " " << p->second << "-" << p->first;
+ m_inv << " " << p->first << "-" << p->second;
+ } else {
+ m_fwd << " " << p->first << "-" << p->second;
+ m_inv << " " << p->second << "-" << p->first;
+ }
+ }
+
+ // Write a count of 1.
+ m_fwd << " ||| 1";
+ m_inv << " ||| 1";
+
+ // Write the PCFG score (if requested).
+ if (m_options.pcfg) {
+ m_fwd << " ||| " << std::exp(rule.GetTargetSide().GetPcfgScore());
+ }
+
+ m_fwd << std::endl;
+ m_inv << std::endl;
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/StsgRuleWriter.h b/phrase-extract/extract-ghkm/StsgRuleWriter.h
new file mode 100644
index 000000000..efba44d2c
--- /dev/null
+++ b/phrase-extract/extract-ghkm/StsgRuleWriter.h
@@ -0,0 +1,41 @@
+#pragma once
+#ifndef EXTRACT_GHKM_STSG_RULE_WRITER_H_
+#define EXTRACT_GHKM_STSG_RULE_WRITER_H_
+
+#include "Subgraph.h"
+
+#include <ostream>
+
+namespace Moses
+{
+namespace GHKM
+{
+
+struct Options;
+class StsgRule;
+class Symbol;
+
+class StsgRuleWriter
+{
+public:
+ StsgRuleWriter(std::ostream &fwd, std::ostream &inv, const Options &options)
+ : m_fwd(fwd)
+ , m_inv(inv)
+ , m_options(options) {}
+
+ void Write(const StsgRule &rule);
+
+private:
+ // Disallow copying
+ StsgRuleWriter(const StsgRuleWriter &);
+ StsgRuleWriter &operator=(const StsgRuleWriter &);
+
+ std::ostream &m_fwd;
+ std::ostream &m_inv;
+ const Options &m_options;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/Subgraph.h b/phrase-extract/extract-ghkm/Subgraph.h
index 1ea2ea387..b02404beb 100644
--- a/phrase-extract/extract-ghkm/Subgraph.h
+++ b/phrase-extract/extract-ghkm/Subgraph.h
@@ -56,6 +56,42 @@ public:
m_pcfgScore = CalcPcfgScore();
}
+ Subgraph(const Subgraph &other, bool targetOnly=false)
+ : m_root(other.m_root)
+ , m_leaves(other.m_leaves)
+ , m_depth(other.m_depth)
+ , m_size(other.m_size)
+ , m_nodeCount(other.m_nodeCount)
+ , m_pcfgScore(other.m_pcfgScore) {
+ if (targetOnly && m_root->GetType() != SOURCE) {
+ // Replace any source-word sink nodes with their parents (except for
+ // the special case where the parent is a non-word tree node -- see
+ // below).
+ std::set<const Node *> targetLeaves;
+ for (std::set<const Node *>::const_iterator p = m_leaves.begin();
+ p != m_leaves.end(); ++p) {
+ const Node *leaf = *p;
+ if (leaf->GetType() != SOURCE) {
+ targetLeaves.insert(leaf);
+ } else {
+ const std::vector<Node*> &parents = leaf->GetParents();
+ for (std::vector<Node*>::const_iterator q = parents.begin();
+ q != parents.end(); ++q) {
+ const Node *parent = *q;
+ // Only add parents that are words, not tree nodes since those
+ // are never sink nodes. (A source word can have a tree node as
+ // its parent due to the heuristic for handling unaligned source
+ // words).
+ if (parent->GetType() == TARGET) {
+ targetLeaves.insert(*q);
+ }
+ }
+ }
+ }
+ m_leaves.swap(targetLeaves);
+ }
+ }
+
const Node *GetRoot() const {
return m_root;
}
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h
index d00fd7d9f..d0209254f 100644
--- a/phrase-extract/extract-ghkm/XmlTreeParser.h
+++ b/phrase-extract/extract-ghkm/XmlTreeParser.h
@@ -45,9 +45,13 @@ class XmlTreeParser
public:
XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
std::auto_ptr<ParseTree> Parse(const std::string &);
+
+ static std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &,
+ const std::vector<std::string> &);
+
+ const std::vector<std::string>& GetWords() { return m_words; };
+
private:
- std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &,
- const std::vector<std::string> &);
std::set<std::string> &m_labelSet;
std::map<std::string, int> &m_topLabelSet;
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index 76f695d2a..a7ec0ac92 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -18,8 +18,8 @@
#include <map>
#include <set>
#include <vector>
+#include <limits>
-#include "SafeGetline.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "InputFileStream.h"
@@ -32,10 +32,6 @@ using namespace MosesTraining;
namespace MosesTraining
{
-
-const long int LINE_MAX_LENGTH = 500000 ;
-
-
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;
@@ -277,20 +273,18 @@ int main(int argc, char* argv[])
int i = sentenceOffset;
- while(true) {
+ string englishString, foreignString, alignmentString, weightString;
+
+ while(getline(*eFileP, englishString)) {
i++;
if (i%10000 == 0) cerr << "." << flush;
- char englishString[LINE_MAX_LENGTH];
- char foreignString[LINE_MAX_LENGTH];
- char alignmentString[LINE_MAX_LENGTH];
- char weightString[LINE_MAX_LENGTH];
- SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
- if (eFileP->eof()) break;
- SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
- SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+
+ getline(*fFileP, foreignString);
+ getline(*aFileP, alignmentString);
if (iwFileP) {
- SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
+ getline(*iwFileP, weightString);
}
+
SentenceAlignment sentence;
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alingment line
@@ -300,7 +294,11 @@ int main(int argc, char* argv[])
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
+ if (sentence.create( englishString.c_str(),
+ foreignString.c_str(),
+ alignmentString.c_str(),
+ weightString.c_str(),
+ i, false)) {
if (options.placeholders.size()) {
sentence.invertAlignment();
}
@@ -366,8 +364,6 @@ void ExtractTask::extract(SentenceAlignment &sentence)
HSentenceVertices outBottomLeft;
HSentenceVertices outBottomRight;
- HSentenceVertices::const_iterator it;
-
bool relaxLimit = m_options.isHierModel();
bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
@@ -378,7 +374,7 @@ void ExtractTask::extract(SentenceAlignment &sentence)
(endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
endE++) {
- int minF = 9999;
+ int minF = std::numeric_limits<int>::max();
int maxF = -1;
vector< int > usedF = sentence.alignedCountS;
for(int ei=startE; ei<=endE; ei++) {
diff --git a/phrase-extract/extract-mixed-syntax/AlignedSentence.cpp b/phrase-extract/extract-mixed-syntax/AlignedSentence.cpp
new file mode 100644
index 000000000..082878c00
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/AlignedSentence.cpp
@@ -0,0 +1,194 @@
+/*
+ * AlignedSentence.cpp
+ *
+ * Created on: 18 Feb 2014
+ * Author: s0565741
+ */
+
+#include <sstream>
+#include "moses/Util.h"
+#include "AlignedSentence.h"
+#include "Parameter.h"
+
+using namespace std;
+
+
+/////////////////////////////////////////////////////////////////////////////////
+AlignedSentence::AlignedSentence(int lineNum,
+ const std::string &source,
+ const std::string &target,
+ const std::string &alignment)
+:m_lineNum(lineNum)
+{
+ PopulateWordVec(m_source, source);
+ PopulateWordVec(m_target, target);
+ PopulateAlignment(alignment);
+}
+
+AlignedSentence::~AlignedSentence() {
+ Moses::RemoveAllInColl(m_source);
+ Moses::RemoveAllInColl(m_target);
+}
+
+void AlignedSentence::PopulateWordVec(Phrase &vec, const std::string &line)
+{
+ std::vector<string> toks;
+ Moses::Tokenize(toks, line);
+
+ vec.resize(toks.size());
+ for (size_t i = 0; i < vec.size(); ++i) {
+ const string &tok = toks[i];
+ Word *word = new Word(i, tok);
+ vec[i] = word;
+ }
+}
+
+void AlignedSentence::PopulateAlignment(const std::string &line)
+{
+ vector<string> alignStr;
+ Moses::Tokenize(alignStr, line);
+
+ for (size_t i = 0; i < alignStr.size(); ++i) {
+ vector<int> alignPair;
+ Moses::Tokenize(alignPair, alignStr[i], "-");
+ assert(alignPair.size() == 2);
+
+ int sourcePos = alignPair[0];
+ int targetPos = alignPair[1];
+
+ if (sourcePos >= m_source.size()) {
+ cerr << "ERROR1:AlignedSentence=" << Debug() << endl;
+ cerr << "m_source=" << m_source.size() << endl;
+ abort();
+ }
+ assert(sourcePos < m_source.size());
+ assert(targetPos < m_target.size());
+ Word *sourceWord = m_source[sourcePos];
+ Word *targetWord = m_target[targetPos];
+
+ sourceWord->AddAlignment(targetWord);
+ targetWord->AddAlignment(sourceWord);
+ }
+}
+
+std::string AlignedSentence::Debug() const
+{
+ stringstream out;
+ out << "m_lineNum:";
+ out << m_lineNum;
+ out << endl;
+
+ out << "m_source:";
+ out << m_source.Debug();
+ out << endl;
+
+ out << "m_target:";
+ out << m_target.Debug();
+ out << endl;
+
+ out << "consistent phrases:" << endl;
+ out << m_consistentPhrases.Debug();
+ out << endl;
+
+ return out.str();
+}
+
+std::vector<int> AlignedSentence::GetSourceAlignmentCount() const
+{
+ vector<int> ret(m_source.size());
+
+ for (size_t i = 0; i < m_source.size(); ++i) {
+ const Word &word = *m_source[i];
+ ret[i] = word.GetAlignmentIndex().size();
+ }
+ return ret;
+}
+
+void AlignedSentence::Create(const Parameter &params)
+{
+ CreateConsistentPhrases(params);
+ m_consistentPhrases.AddHieroNonTerms(params);
+}
+
+void AlignedSentence::CreateConsistentPhrases(const Parameter &params)
+{
+ int countT = m_target.size();
+ int countS = m_source.size();
+
+ m_consistentPhrases.Initialize(countS);
+
+ // check alignments for target phrase startT...endT
+ for(int lengthT=1;
+ lengthT <= params.maxSpan && lengthT <= countT;
+ lengthT++) {
+ for(int startT=0; startT < countT-(lengthT-1); startT++) {
+
+ // that's nice to have
+ int endT = startT + lengthT - 1;
+
+ // find find aligned source words
+ // first: find minimum and maximum source word
+ int minS = 9999;
+ int maxS = -1;
+ vector< int > usedS = GetSourceAlignmentCount();
+ for(int ti=startT; ti<=endT; ti++) {
+ const Word &word = *m_target[ti];
+ const std::set<int> &alignment = word.GetAlignmentIndex();
+
+ std::set<int>::const_iterator iterAlign;
+ for(iterAlign = alignment.begin(); iterAlign != alignment.end(); ++iterAlign) {
+ int si = *iterAlign;
+ if (si<minS) {
+ minS = si;
+ }
+ if (si>maxS) {
+ maxS = si;
+ }
+ usedS[ si ]--;
+ }
+ }
+
+ // unaligned phrases are not allowed
+ if( maxS == -1 )
+ continue;
+
+ // source phrase has to be within limits
+ size_t width = maxS - minS + 1;
+
+ if( width < params.minSpan )
+ continue;
+
+ if( width > params.maxSpan )
+ continue;
+
+ // check if source words are aligned to out of bound target words
+ bool out_of_bounds = false;
+ for(int si=minS; si<=maxS && !out_of_bounds; si++)
+ if (usedS[si]>0) {
+ out_of_bounds = true;
+ }
+
+ // if out of bound, you gotta go
+ if (out_of_bounds)
+ continue;
+
+ // done with all the checks, lets go over all consistent phrase pairs
+ // start point of source phrase may retreat over unaligned
+ for(int startS=minS;
+ (startS>=0 &&
+ startS>maxS - params.maxSpan && // within length limit
+ (startS==minS || m_source[startS]->GetAlignment().size()==0)); // unaligned
+ startS--) {
+ // end point of source phrase may advance over unaligned
+ for(int endS=maxS;
+ (endS<countS && endS<startS + params.maxSpan && // within length limit
+ (endS==maxS || m_source[endS]->GetAlignment().size()==0)); // unaligned
+ endS++) {
+
+ // take note that this is a valid phrase alignment
+ m_consistentPhrases.Add(startS, endS, startT, endT, params);
+ }
+ }
+ }
+ }
+}
diff --git a/phrase-extract/extract-mixed-syntax/AlignedSentence.h b/phrase-extract/extract-mixed-syntax/AlignedSentence.h
new file mode 100644
index 000000000..915bdf90c
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/AlignedSentence.h
@@ -0,0 +1,51 @@
+/*
+ * AlignedSentence.h
+ *
+ * Created on: 18 Feb 2014
+ * Author: s0565741
+ */
+#pragma once
+
+#include <string>
+#include <set>
+#include "ConsistentPhrases.h"
+#include "Phrase.h"
+#include "moses/TypeDef.h"
+
+class Parameter;
+
+class AlignedSentence {
+public:
+ AlignedSentence(int lineNum)
+ :m_lineNum(lineNum)
+ {}
+
+ AlignedSentence(int lineNum,
+ const std::string &source,
+ const std::string &target,
+ const std::string &alignment);
+ virtual ~AlignedSentence();
+ virtual void Create(const Parameter &params);
+
+ const Phrase &GetPhrase(Moses::FactorDirection direction) const
+ { return (direction == Moses::Input) ? m_source : m_target; }
+
+ const ConsistentPhrases &GetConsistentPhrases() const
+ { return m_consistentPhrases; }
+
+ virtual std::string Debug() const;
+
+ int m_lineNum;
+protected:
+ Phrase m_source, m_target;
+ ConsistentPhrases m_consistentPhrases;
+
+ void CreateConsistentPhrases(const Parameter &params);
+ void PopulateWordVec(Phrase &vec, const std::string &line);
+
+ // m_source and m_target MUST be populated before calling this
+ void PopulateAlignment(const std::string &line);
+ std::vector<int> GetSourceAlignmentCount() const;
+};
+
+
diff --git a/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp b/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp
new file mode 100644
index 000000000..3d63ed044
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp
@@ -0,0 +1,183 @@
+/*
+ * AlignedSentenceSyntax.cpp
+ *
+ * Created on: 26 Feb 2014
+ * Author: hieu
+ */
+
+#include "AlignedSentenceSyntax.h"
+#include "Parameter.h"
+#include "pugixml.hpp"
+#include "moses/Util.h"
+
+using namespace std;
+
+AlignedSentenceSyntax::AlignedSentenceSyntax(int lineNum,
+ const std::string &source,
+ const std::string &target,
+ const std::string &alignment)
+:AlignedSentence(lineNum)
+,m_sourceStr(source)
+,m_targetStr(target)
+,m_alignmentStr(alignment)
+{
+}
+
+AlignedSentenceSyntax::~AlignedSentenceSyntax() {
+ // TODO Auto-generated destructor stub
+}
+
+void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const Parameter &params,
+ string line, Phrase &phrase, SyntaxTree &tree)
+{
+ // parse source and target string
+ if (isSyntax) {
+ line = "<xml><tree label=\"X\">" + line + "</tree></xml>";
+ XMLParse(phrase, tree, line, params);
+
+ if (mixedSyntaxType != 0) {
+ // mixed syntax. Always add [X] where there isn't 1
+ tree.SetHieroLabel(params.hieroNonTerm);
+ if (mixedSyntaxType == 2) {
+ tree.AddToAll(params.hieroNonTerm);
+ }
+ }
+ }
+ else {
+ PopulateWordVec(phrase, line);
+ tree.SetHieroLabel(params.hieroNonTerm);
+ }
+
+}
+
+void AlignedSentenceSyntax::Create(const Parameter &params)
+{
+ Populate(params.sourceSyntax, params.mixedSyntaxType, params, m_sourceStr,
+ m_source, m_sourceTree);
+ Populate(params.targetSyntax, params.mixedSyntaxType, params, m_targetStr,
+ m_target, m_targetTree);
+
+ PopulateAlignment(m_alignmentStr);
+ CreateConsistentPhrases(params);
+
+ // create labels
+ CreateNonTerms();
+}
+
+void Escape(string &text)
+{
+ text = Moses::Replace(text, "&", "&amp;");
+ text = Moses::Replace(text, "|", "&#124;");
+ text = Moses::Replace(text, "<", "&lt;");
+ text = Moses::Replace(text, ">", "&gt;");
+ text = Moses::Replace(text, "'", "&apos;");
+ text = Moses::Replace(text, "\"", "&quot;");
+ text = Moses::Replace(text, "[", "&#91;");
+ text = Moses::Replace(text, "]", "&#93;");
+
+}
+
+void AlignedSentenceSyntax::XMLParse(Phrase &output,
+ SyntaxTree &tree,
+ const pugi::xml_node &parentNode,
+ const Parameter &params)
+{
+ int childNum = 0;
+ for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling())
+ {
+ string nodeName = childNode.name();
+
+ // span label
+ string label;
+ int startPos = output.size();
+
+ if (!nodeName.empty()) {
+ pugi::xml_attribute attribute = childNode.attribute("label");
+ label = attribute.as_string();
+
+ // recursively call this function. For proper recursive trees
+ XMLParse(output, tree, childNode, params);
+ }
+
+
+
+ // fill phrase vector
+ string text = childNode.value();
+ Escape(text);
+ //cerr << childNum << " " << label << "=" << text << endl;
+
+ std::vector<string> toks;
+ Moses::Tokenize(toks, text);
+
+ for (size_t i = 0; i < toks.size(); ++i) {
+ const string &tok = toks[i];
+ Word *word = new Word(output.size(), tok);
+ output.push_back(word);
+ }
+
+ // is it a labelled span?
+ int endPos = output.size() - 1;
+
+ // fill syntax labels
+ if (!label.empty()) {
+ label = "[" + label + "]";
+ tree.Add(startPos, endPos, label, params);
+ }
+
+ ++childNum;
+ }
+
+}
+
+void AlignedSentenceSyntax::XMLParse(Phrase &output,
+ SyntaxTree &tree,
+ const std::string input,
+ const Parameter &params)
+{
+ pugi::xml_document doc;
+ pugi::xml_parse_result result = doc.load(input.c_str(),
+ pugi::parse_default | pugi::parse_comments);
+
+ pugi::xml_node topNode = doc.child("xml");
+ XMLParse(output, tree, topNode, params);
+}
+
+void AlignedSentenceSyntax::CreateNonTerms()
+{
+ for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) {
+ for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) {
+ ConsistentPhrases::Coll &coll = m_consistentPhrases.GetColl(sourceStart, sourceEnd);
+ const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd);
+
+ ConsistentPhrases::Coll::iterator iter;
+ for (iter = coll.begin(); iter != coll.end(); ++iter) {
+ ConsistentPhrase &cp = **iter;
+
+ int targetStart = cp.corners[2];
+ int targetEnd = cp.corners[3];
+ const SyntaxTree::Labels &targetLabels = m_targetTree.Find(targetStart, targetEnd);
+
+ CreateNonTerms(cp, sourceLabels, targetLabels);
+ }
+ }
+ }
+
+}
+
+void AlignedSentenceSyntax::CreateNonTerms(ConsistentPhrase &cp,
+ const SyntaxTree::Labels &sourceLabels,
+ const SyntaxTree::Labels &targetLabels)
+{
+ SyntaxTree::Labels::const_iterator iterSource;
+ for (iterSource = sourceLabels.begin(); iterSource != sourceLabels.end(); ++iterSource) {
+ const string &sourceLabel = *iterSource;
+
+ SyntaxTree::Labels::const_iterator iterTarget;
+ for (iterTarget = targetLabels.begin(); iterTarget != targetLabels.end(); ++iterTarget) {
+ const string &targetLabel = *iterTarget;
+ cp.AddNonTerms(sourceLabel, targetLabel);
+ }
+ }
+}
+
+
diff --git a/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h b/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h
new file mode 100644
index 000000000..2e9431996
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h
@@ -0,0 +1,46 @@
+/*
+ * AlignedSentenceSyntax.h
+ *
+ * Created on: 26 Feb 2014
+ * Author: hieu
+ */
+
+#pragma once
+
+#include "AlignedSentence.h"
+#include "SyntaxTree.h"
+#include "pugixml.hpp"
+
+class AlignedSentenceSyntax : public AlignedSentence
+{
+public:
+ AlignedSentenceSyntax(int lineNum,
+ const std::string &source,
+ const std::string &target,
+ const std::string &alignment);
+ virtual ~AlignedSentenceSyntax();
+
+ void Create(const Parameter &params);
+
+ //virtual std::string Debug() const;
+protected:
+ std::string m_sourceStr, m_targetStr, m_alignmentStr;
+ SyntaxTree m_sourceTree, m_targetTree;
+
+ void XMLParse(Phrase &output,
+ SyntaxTree &tree,
+ const std::string input,
+ const Parameter &params);
+ void XMLParse(Phrase &output,
+ SyntaxTree &tree,
+ const pugi::xml_node &parentNode,
+ const Parameter &params);
+ void CreateNonTerms();
+ void CreateNonTerms(ConsistentPhrase &cp,
+ const SyntaxTree::Labels &sourceLabels,
+ const SyntaxTree::Labels &targetLabels);
+ void Populate(bool isSyntax, int mixedSyntaxType, const Parameter &params,
+ std::string line, Phrase &phrase, SyntaxTree &tree);
+
+};
+
diff --git a/phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp b/phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp
new file mode 100644
index 000000000..bb913da5a
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp
@@ -0,0 +1,66 @@
+/*
+ * ConsistentPhrase.cpp
+ *
+ * Created on: 20 Feb 2014
+ * Author: hieu
+ */
+#include <sstream>
+#include "ConsistentPhrase.h"
+#include "Word.h"
+#include "NonTerm.h"
+#include "Parameter.h"
+
+using namespace std;
+
+ConsistentPhrase::ConsistentPhrase(
+ int sourceStart, int sourceEnd,
+ int targetStart, int targetEnd,
+ const Parameter &params)
+:corners(4)
+,m_hieroNonTerm(*this, params.hieroNonTerm, params.hieroNonTerm)
+{
+ corners[0] = sourceStart;
+ corners[1] = sourceEnd;
+ corners[2] = targetStart;
+ corners[3] = targetEnd;
+}
+
+ConsistentPhrase::~ConsistentPhrase() {
+ // TODO Auto-generated destructor stub
+}
+
+bool ConsistentPhrase::operator<(const ConsistentPhrase &other) const
+{
+ return corners < other.corners;
+}
+
+void ConsistentPhrase::AddNonTerms(const std::string &source,
+ const std::string &target)
+{
+ m_nonTerms.push_back(NonTerm(*this, source, target));
+}
+
+bool ConsistentPhrase::TargetOverlap(const ConsistentPhrase &other) const
+{
+ if ( other.corners[3] < corners[2] || other.corners[2] > corners[3])
+ return false;
+
+ return true;
+}
+
+std::string ConsistentPhrase::Debug() const
+{
+ stringstream out;
+ out << "[" << corners[0] << "-" << corners[1]
+ << "][" << corners[2] << "-" << corners[3] << "]";
+
+ out << "NT:";
+ for (size_t i = 0; i < m_nonTerms.size(); ++i) {
+ const NonTerm &nonTerm = m_nonTerms[i];
+ out << nonTerm.GetLabel(Moses::Input) << ":" << nonTerm.GetLabel(Moses::Output);
+ }
+
+ return out.str();
+}
+
+
diff --git a/phrase-extract/extract-mixed-syntax/ConsistentPhrase.h b/phrase-extract/extract-mixed-syntax/ConsistentPhrase.h
new file mode 100644
index 000000000..865b4386f
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrase.h
@@ -0,0 +1,51 @@
+/*
+ * ConsistentPhrase.h
+ *
+ * Created on: 20 Feb 2014
+ * Author: hieu
+ */
+
+#pragma once
+
+#include <cassert>
+#include <vector>
+#include <iostream>
+#include "moses/TypeDef.h"
+#include "NonTerm.h"
+
+class ConsistentPhrase
+{
+public:
+ typedef std::vector<NonTerm> NonTerms;
+
+ std::vector<int> corners;
+
+ ConsistentPhrase(const ConsistentPhrase &copy); // do not implement
+ ConsistentPhrase(int sourceStart, int sourceEnd,
+ int targetStart, int targetEnd,
+ const Parameter &params);
+
+ virtual ~ConsistentPhrase();
+
+ int GetWidth(Moses::FactorDirection direction) const
+ { return (direction == Moses::Input) ? corners[1] - corners[0] + 1 : corners[3] - corners[2] + 1; }
+
+
+ void AddNonTerms(const std::string &source,
+ const std::string &target);
+ const NonTerms &GetNonTerms() const
+ { return m_nonTerms;}
+ const NonTerm &GetHieroNonTerm() const
+ { return m_hieroNonTerm;}
+
+ bool TargetOverlap(const ConsistentPhrase &other) const;
+
+ bool operator<(const ConsistentPhrase &other) const;
+
+ std::string Debug() const;
+
+protected:
+ NonTerms m_nonTerms;
+ NonTerm m_hieroNonTerm;
+};
+
diff --git a/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
new file mode 100644
index 000000000..8978c88fa
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
@@ -0,0 +1,103 @@
+/*
+ * ConsistentPhrases.cpp
+ *
+ * Created on: 20 Feb 2014
+ * Author: hieu
+ */
+#include <sstream>
+#include <cassert>
+#include "ConsistentPhrases.h"
+#include "NonTerm.h"
+#include "Parameter.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+ConsistentPhrases::ConsistentPhrases()
+{
+}
+
+ConsistentPhrases::~ConsistentPhrases() {
+ for (int start = 0; start < m_coll.size(); ++start) {
+ std::vector<Coll> &allSourceStart = m_coll[start];
+
+ for (int size = 0; size < allSourceStart.size(); ++size) {
+ Coll &coll = allSourceStart[size];
+ Moses::RemoveAllInColl(coll);
+ }
+ }
+}
+
+void ConsistentPhrases::Initialize(size_t size)
+{
+ m_coll.resize(size);
+
+ for (size_t sourceStart = 0; sourceStart < size; ++sourceStart) {
+ std::vector<Coll> &allSourceStart = m_coll[sourceStart];
+ allSourceStart.resize(size - sourceStart);
+ }
+}
+
+void ConsistentPhrases::Add(int sourceStart, int sourceEnd,
+ int targetStart, int targetEnd,
+ const Parameter &params)
+{
+ Coll &coll = m_coll[sourceStart][sourceEnd - sourceStart];
+ ConsistentPhrase *cp = new ConsistentPhrase(sourceStart, sourceEnd,
+ targetStart, targetEnd,
+ params);
+
+ pair<Coll::iterator, bool> inserted = coll.insert(cp);
+ assert(inserted.second);
+}
+
+const ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd) const
+{
+ const std::vector<Coll> &allSourceStart = m_coll[sourceStart];
+ const Coll &ret = allSourceStart[sourceEnd - sourceStart];
+ return ret;
+}
+
+ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd)
+{
+ std::vector<Coll> &allSourceStart = m_coll[sourceStart];
+ Coll &ret = allSourceStart[sourceEnd - sourceStart];
+ return ret;
+}
+
+std::string ConsistentPhrases::Debug() const
+{
+ std::stringstream out;
+ for (int start = 0; start < m_coll.size(); ++start) {
+ const std::vector<Coll> &allSourceStart = m_coll[start];
+
+ for (int size = 0; size < allSourceStart.size(); ++size) {
+ const Coll &coll = allSourceStart[size];
+
+ Coll::const_iterator iter;
+ for (iter = coll.begin(); iter != coll.end(); ++iter) {
+ const ConsistentPhrase &consistentPhrase = **iter;
+ out << consistentPhrase.Debug() << endl;
+ }
+ }
+ }
+
+ return out.str();
+}
+
+void ConsistentPhrases::AddHieroNonTerms(const Parameter &params)
+{
+ // add [X] labels everywhere
+ for (int i = 0; i < m_coll.size(); ++i) {
+ vector<Coll> &inner = m_coll[i];
+ for (int j = 0; j < inner.size(); ++j) {
+ ConsistentPhrases::Coll &coll = inner[j];
+ ConsistentPhrases::Coll::iterator iter;
+ for (iter = coll.begin(); iter != coll.end(); ++iter) {
+ ConsistentPhrase &cp = **iter;
+ cp.AddNonTerms(params.hieroNonTerm, params.hieroNonTerm);
+ }
+ }
+ }
+}
+
diff --git a/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h
new file mode 100644
index 000000000..3daf6b7ff
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h
@@ -0,0 +1,40 @@
+/*
+ * ConsistentPhrases.h
+ *
+ * Created on: 20 Feb 2014
+ * Author: hieu
+ */
+#pragma once
+
+#include <set>
+#include <vector>
+#include <iostream>
+#include "ConsistentPhrase.h"
+
+class Word;
+class Parameter;
+
+class ConsistentPhrases {
+public:
+ typedef std::set<ConsistentPhrase*> Coll;
+
+ ConsistentPhrases();
+ virtual ~ConsistentPhrases();
+
+ void Initialize(size_t size);
+
+ void Add(int sourceStart, int sourceEnd,
+ int targetStart, int targetEnd,
+ const Parameter &params);
+
+ void AddHieroNonTerms(const Parameter &params);
+
+ const Coll &GetColl(int sourceStart, int sourceEnd) const;
+ Coll &GetColl(int sourceStart, int sourceEnd);
+
+ std::string Debug() const;
+
+protected:
+ std::vector< std::vector<Coll> > m_coll;
+};
+
diff --git a/contrib/other-builds/extract-mixed-syntax/InputFileStream.cpp b/phrase-extract/extract-mixed-syntax/InputFileStream.cpp
index b52d1f920..b52d1f920 100644
--- a/contrib/other-builds/extract-mixed-syntax/InputFileStream.cpp
+++ b/phrase-extract/extract-mixed-syntax/InputFileStream.cpp
diff --git a/contrib/other-builds/extract-mixed-syntax/InputFileStream.h b/phrase-extract/extract-mixed-syntax/InputFileStream.h
index f10ec2164..f10ec2164 100644
--- a/contrib/other-builds/extract-mixed-syntax/InputFileStream.h
+++ b/phrase-extract/extract-mixed-syntax/InputFileStream.h
diff --git a/phrase-extract/extract-mixed-syntax/Jamfile b/phrase-extract/extract-mixed-syntax/Jamfile
new file mode 100644
index 000000000..520cd65cb
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Jamfile
@@ -0,0 +1,2 @@
+exe extract-mixed-syntax : Main.cpp AlignedSentence.cpp AlignedSentenceSyntax.cpp ConsistentPhrase.cpp ConsistentPhrases.cpp NonTerm.cpp Parameter.cpp Phrase.cpp pugixml.cpp Rule.cpp RulePhrase.cpp Rules.cpp RuleSymbol.cpp SyntaxTree.cpp Word.cpp ..//deps ../..//z ../..//boost_iostreams ../..//boost_program_options ../../moses//moses : <include>.. ;
+
diff --git a/phrase-extract/extract-mixed-syntax/Main.cpp b/phrase-extract/extract-mixed-syntax/Main.cpp
new file mode 100644
index 000000000..10656b577
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Main.cpp
@@ -0,0 +1,208 @@
+#include <iostream>
+#include <cstdlib>
+#include <boost/program_options.hpp>
+
+#include "Main.h"
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+#include "AlignedSentence.h"
+#include "AlignedSentenceSyntax.h"
+#include "Parameter.h"
+#include "Rules.h"
+
+using namespace std;
+
+bool g_debug = false;
+
+int main(int argc, char** argv)
+{
+ cerr << "Starting" << endl;
+
+ Parameter params;
+
+ namespace po = boost::program_options;
+ po::options_description desc("Options");
+ desc.add_options()
+ ("help", "Print help messages")
+ ("MaxSpan", po::value<int>()->default_value(params.maxSpan), "Max (source) span of a rule. ie. number of words in the source")
+ ("MinSpan", po::value<int>()->default_value(params.minSpan), "Min (source) span of a rule.")
+ ("GlueGrammar", po::value<string>()->default_value(params.gluePath), "Output glue grammar to here")
+ ("SentenceOffset", po::value<long>()->default_value(params.sentenceOffset), "Starting sentence id. Not used")
+ ("GZOutput", "Compress extract files")
+ ("MaxNonTerm", po::value<int>()->default_value(params.maxNonTerm), "Maximum number of non-terms allowed per rule")
+ ("MaxHieroNonTerm", po::value<int>()->default_value(params.maxHieroNonTerm), "Maximum number of Hiero non-term. Usually, --MaxNonTerm is the normal constraint")
+ ("MinHoleSource", po::value<int>()->default_value(params.minHoleSource), "Minimum source span for a non-term.")
+ ("MinHoleSourceSyntax", po::value<int>()->default_value(params.minHoleSourceSyntax), "Minimum source span for a syntactic non-term (source or target).")
+
+ ("SourceSyntax", "Source sentence is a parse tree")
+ ("TargetSyntax", "Target sentence is a parse tree")
+ ("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere")
+ ("MultiLabel", po::value<int>()->default_value(params.multiLabel), "What to do with multiple labels on the same span. 0(default)=keep them all, 1=keep only top-most, 2=keep only bottom-most")
+ ("HieroSourceLHS", "Always use Hiero source LHS? Default = 0")
+ ("MaxSpanFreeNonTermSource", po::value<int>()->default_value(params.maxSpanFreeNonTermSource), "Max number of words covered by beginning/end NT. Default = 0 (no limit)")
+ ("NoNieceTerminal", "Don't extract rule if 1 of the non-term covers the same word as 1 of the terminals")
+ ("MaxScope", po::value<int>()->default_value(params.maxScope), "maximum scope (see Hopkins and Langmead (2010)). Default is HIGH")
+ ("MinScope", po::value<int>()->default_value(params.minScope), "min scope.")
+
+ ("SpanLength", "Property - span length of RHS each non-term")
+
+ ("NonTermContext", "Property - (source) left and right, inside and outside words of each non-term ")
+ ("NonTermContextTarget", "Property - (target) left and right, inside and outside words of each non-term")
+ ("NonTermContextFactor", po::value<int>()->default_value(params.nonTermContextFactor), "Factor to use for non-term context property.")
+
+ ("NumSourceFactors", po::value<int>()->default_value(params.numSourceFactors), "Number of source factors.")
+ ("NumTargetFactors", po::value<int>()->default_value(params.numTargetFactors), "Number of target factors.")
+
+ ("HieroNonTerm", po::value<string>()->default_value(params.hieroNonTerm), "Hiero non-terminal label, including bracket")
+ ("ScopeSpan", po::value<string>()->default_value(params.scopeSpanStr), "Min and max span for rules of each scope. Format is min,max:min,max...")
+
+ ("NonTermConsecSource", "Allow consecutive non-terms on the source side")
+ ("NonTermConsecSourceMixedSyntax", po::value<int>()->default_value(params.nonTermConsecSourceMixedSyntax), "In mixed syntax mode, what nt can be consecutive. 0=don't allow consec nt. 1(default)=hiero+syntax. 2=syntax+syntax. 3=always allow");
+
+
+ po::variables_map vm;
+ try
+ {
+ po::store(po::parse_command_line(argc, argv, desc),
+ vm); // can throw
+
+ /** --help option
+ */
+ if ( vm.count("help") || argc < 5 )
+ {
+ std::cout << argv[0] << " target source alignment [options...]" << std::endl
+ << desc << std::endl;
+ return EXIT_SUCCESS;
+ }
+
+ po::notify(vm); // throws on error, so do after help in case
+ // there are any problems
+ }
+ catch(po::error& e)
+ {
+ std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
+ std::cerr << desc << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ if (vm.count("MaxSpan")) params.maxSpan = vm["MaxSpan"].as<int>();
+ if (vm.count("MinSpan")) params.minSpan = vm["MinSpan"].as<int>();
+ if (vm.count("GZOutput")) params.gzOutput = true;
+ if (vm.count("GlueGrammar")) params.gluePath = vm["GlueGrammar"].as<string>();
+ if (vm.count("SentenceOffset")) params.sentenceOffset = vm["SentenceOffset"].as<long>();
+ if (vm.count("MaxNonTerm")) params.maxNonTerm = vm["MaxNonTerm"].as<int>();
+ if (vm.count("MaxHieroNonTerm")) params.maxHieroNonTerm = vm["MaxHieroNonTerm"].as<int>();
+ if (vm.count("MinHoleSource")) params.minHoleSource = vm["MinHoleSource"].as<int>();
+ if (vm.count("MinHoleSourceSyntax")) params.minHoleSourceSyntax = vm["MinHoleSourceSyntax"].as<int>();
+
+ if (vm.count("SourceSyntax")) params.sourceSyntax = true;
+ if (vm.count("TargetSyntax")) params.targetSyntax = true;
+ if (vm.count("MixedSyntaxType")) params.mixedSyntaxType = vm["MixedSyntaxType"].as<int>();
+ if (vm.count("MultiLabel")) params.multiLabel = vm["MultiLabel"].as<int>();
+ if (vm.count("HieroSourceLHS")) params.hieroSourceLHS = true;
+ if (vm.count("MaxSpanFreeNonTermSource")) params.maxSpanFreeNonTermSource = vm["MaxSpanFreeNonTermSource"].as<int>();
+ if (vm.count("NoNieceTerminal")) params.nieceTerminal = false;
+ if (vm.count("MaxScope")) params.maxScope = vm["MaxScope"].as<int>();
+ if (vm.count("MinScope")) params.minScope = vm["MinScope"].as<int>();
+
+ // properties
+ if (vm.count("SpanLength")) params.spanLength = true;
+ if (vm.count("NonTermContext")) params.nonTermContext = true;
+ if (vm.count("NonTermContextTarget")) params.nonTermContextTarget = true;
+ if (vm.count("NonTermContextFactor")) params.nonTermContextFactor = vm["NonTermContextFactor"].as<int>();
+
+ if (vm.count("NumSourceFactors")) params.numSourceFactors = vm["NumSourceFactors"].as<int>();
+ if (vm.count("NumTargetFactors")) params.numTargetFactors = vm["NumTargetFactors"].as<int>();
+
+ if (vm.count("HieroNonTerm")) params.hieroNonTerm = vm["HieroNonTerm"].as<string>();
+ if (vm.count("ScopeSpan")) {
+ params.SetScopeSpan(vm["ScopeSpan"].as<string>());
+ }
+
+ if (vm.count("NonTermConsecSource")) params.nonTermConsecSource = true;
+ if (vm.count("NonTermConsecSourceMixedSyntax")) params.nonTermConsecSourceMixedSyntax = vm["NonTermConsecSourceMixedSyntax"].as<int>();
+
+
+ // input files
+ string pathTarget = argv[1];
+ string pathSource = argv[2];
+ string pathAlignment = argv[3];
+
+ string pathExtract = argv[4];
+ string pathExtractInv = pathExtract + ".inv";
+ if (params.gzOutput) {
+ pathExtract += ".gz";
+ pathExtractInv += ".gz";
+ }
+
+ Moses::InputFileStream strmTarget(pathTarget);
+ Moses::InputFileStream strmSource(pathSource);
+ Moses::InputFileStream strmAlignment(pathAlignment);
+ Moses::OutputFileStream extractFile(pathExtract);
+ Moses::OutputFileStream extractInvFile(pathExtractInv);
+
+
+ // MAIN LOOP
+ int lineNum = 1;
+ string lineTarget, lineSource, lineAlignment;
+ while (getline(strmTarget, lineTarget)) {
+ if (lineNum % 10000 == 0) {
+ cerr << lineNum << " ";
+ }
+
+ bool success;
+ success = getline(strmSource, lineSource);
+ if (!success) {
+ throw "Couldn't read source";
+ }
+ success = getline(strmAlignment, lineAlignment);
+ if (!success) {
+ throw "Couldn't read alignment";
+ }
+
+ /*
+ cerr << "lineTarget=" << lineTarget << endl;
+ cerr << "lineSource=" << lineSource << endl;
+ cerr << "lineAlignment=" << lineAlignment << endl;
+ */
+
+ AlignedSentence *alignedSentence;
+
+ if (params.sourceSyntax || params.targetSyntax) {
+ alignedSentence = new AlignedSentenceSyntax(lineNum, lineSource, lineTarget, lineAlignment);
+ }
+ else {
+ alignedSentence = new AlignedSentence(lineNum, lineSource, lineTarget, lineAlignment);
+ }
+
+ alignedSentence->Create(params);
+ //cerr << alignedSentence->Debug();
+
+ Rules rules(*alignedSentence);
+ rules.Extend(params);
+ rules.Consolidate(params);
+ //cerr << rules.Debug();
+
+ rules.Output(extractFile, true, params);
+ rules.Output(extractInvFile, false, params);
+
+ delete alignedSentence;
+
+ ++lineNum;
+ }
+
+ if (!params.gluePath.empty()) {
+ Moses::OutputFileStream glueFile(params.gluePath);
+ CreateGlueGrammar(glueFile);
+ }
+
+ cerr << "Finished" << endl;
+}
+
+void CreateGlueGrammar(Moses::OutputFileStream &glueFile)
+{
+ glueFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
+ << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
+ << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
+
+}
diff --git a/phrase-extract/extract-mixed-syntax/Main.h b/phrase-extract/extract-mixed-syntax/Main.h
new file mode 100644
index 000000000..9744ba389
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Main.h
@@ -0,0 +1,12 @@
+/*
+ * Main.h
+ *
+ * Created on: 28 Feb 2014
+ * Author: hieu
+ */
+#pragma once
+
+#include "OutputFileStream.h"
+
+void CreateGlueGrammar(Moses::OutputFileStream &glueFile);
+
diff --git a/phrase-extract/extract-mixed-syntax/NonTerm.cpp b/phrase-extract/extract-mixed-syntax/NonTerm.cpp
new file mode 100644
index 000000000..5de780a9a
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/NonTerm.cpp
@@ -0,0 +1,66 @@
+/*
+ * NonTerm.cpp
+ *
+ * Created on: 22 Feb 2014
+ * Author: hieu
+ */
+
+#include <sstream>
+#include "NonTerm.h"
+#include "Word.h"
+#include "ConsistentPhrase.h"
+#include "Parameter.h"
+
+using namespace std;
+
+NonTerm::NonTerm(const ConsistentPhrase &consistentPhrase,
+ const std::string &source,
+ const std::string &target)
+:m_consistentPhrase(&consistentPhrase)
+,m_source(source)
+,m_target(target)
+{
+ // TODO Auto-generated constructor stub
+
+}
+
+NonTerm::~NonTerm() {
+ // TODO Auto-generated destructor stub
+}
+
+std::string NonTerm::Debug() const
+{
+ stringstream out;
+ out << m_source << m_target;
+ out << m_consistentPhrase->Debug();
+ return out.str();
+}
+
+void NonTerm::Output(std::ostream &out) const
+{
+ out << m_source << m_target;
+}
+
+void NonTerm::Output(std::ostream &out, Moses::FactorDirection direction) const
+{
+ out << GetLabel(direction);
+}
+
+const std::string &NonTerm::GetLabel(Moses::FactorDirection direction) const
+{
+ return (direction == Moses::Input) ? m_source : m_target;
+}
+
+bool NonTerm::IsHiero(Moses::FactorDirection direction, const Parameter &params) const
+{
+ const std::string &label = NonTerm::GetLabel(direction);
+ return label == params.hieroNonTerm;
+}
+
+bool NonTerm::IsHiero(const Parameter &params) const
+{
+ return IsHiero(Moses::Input, params) && IsHiero(Moses::Output, params);
+}
+
+int NonTerm::GetWidth(Moses::FactorDirection direction) const
+{ return GetConsistentPhrase().GetWidth(direction); }
diff --git a/phrase-extract/extract-mixed-syntax/NonTerm.h b/phrase-extract/extract-mixed-syntax/NonTerm.h
new file mode 100644
index 000000000..5b3bb9f04
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/NonTerm.h
@@ -0,0 +1,47 @@
+/*
+ * NonTerm.h
+ *
+ * Created on: 22 Feb 2014
+ * Author: hieu
+ */
+#pragma once
+#include <string>
+#include "RuleSymbol.h"
+#include "moses/TypeDef.h"
+
+class ConsistentPhrase;
+class Parameter;
+
+class NonTerm : public RuleSymbol
+{
+public:
+
+ NonTerm(const ConsistentPhrase &consistentPhrase,
+ const std::string &source,
+ const std::string &target);
+ virtual ~NonTerm();
+
+ const ConsistentPhrase &GetConsistentPhrase() const
+ { return *m_consistentPhrase; }
+
+ int GetWidth(Moses::FactorDirection direction) const;
+
+ virtual bool IsNonTerm() const
+ { return true; }
+
+ std::string GetString() const
+ { return m_source + m_target; }
+
+ virtual std::string Debug() const;
+ virtual void Output(std::ostream &out) const;
+ void Output(std::ostream &out, Moses::FactorDirection direction) const;
+
+ const std::string &GetLabel(Moses::FactorDirection direction) const;
+ bool IsHiero(Moses::FactorDirection direction, const Parameter &params) const;
+ bool IsHiero(const Parameter &params) const;
+
+protected:
+ const ConsistentPhrase *m_consistentPhrase;
+ std::string m_source, m_target;
+};
+
diff --git a/phrase-extract/extract-mixed-syntax/Parameter.cpp b/phrase-extract/extract-mixed-syntax/Parameter.cpp
new file mode 100644
index 000000000..4f742e774
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Parameter.cpp
@@ -0,0 +1,72 @@
+/*
+ * Parameter.cpp
+ *
+ * Created on: 17 Feb 2014
+ * Author: hieu
+ */
+#include "Parameter.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+
+using namespace std;
+
+Parameter::Parameter()
+:maxSpan(10)
+,minSpan(0)
+,maxNonTerm(2)
+,maxHieroNonTerm(999)
+,maxSymbolsTarget(999)
+,maxSymbolsSource(5)
+,minHoleSource(2)
+,minHoleSourceSyntax(1)
+,sentenceOffset(0)
+,nonTermConsecSource(false)
+,requireAlignedWord(true)
+,fractionalCounting(true)
+,gzOutput(false)
+
+,hieroNonTerm("[X]")
+,sourceSyntax(false)
+,targetSyntax(false)
+
+,mixedSyntaxType(0)
+,multiLabel(0)
+,nonTermConsecSourceMixed(true)
+,hieroSourceLHS(false)
+,maxSpanFreeNonTermSource(0)
+,nieceTerminal(true)
+,maxScope(UNDEFINED)
+,minScope(0)
+
+,spanLength(false)
+,nonTermContext(false)
+,nonTermContextTarget(false)
+,nonTermContextFactor(0)
+
+,numSourceFactors(1)
+,numTargetFactors(1)
+
+,nonTermConsecSourceMixedSyntax(1)
+{}
+
+Parameter::~Parameter() {
+ // TODO Auto-generated destructor stub
+}
+
+void Parameter::SetScopeSpan(const std::string &str)
+{
+ scopeSpanStr = str;
+ vector<string> toks1;
+ Moses::Tokenize(toks1, str, ":");
+
+ for (size_t i = 0; i < toks1.size(); ++i) {
+ const string &tok1 = toks1[i];
+
+ vector<int> toks2;
+ Moses::Tokenize<int>(toks2, tok1, ",");
+ UTIL_THROW_IF2(toks2.size() != 2, "Format is min,max:min,max... String is " << tok1);
+
+ std::pair<int,int> values(toks2[0], toks2[1]);
+ scopeSpan.push_back(values);
+ }
+}
diff --git a/phrase-extract/extract-mixed-syntax/Parameter.h b/phrase-extract/extract-mixed-syntax/Parameter.h
new file mode 100644
index 000000000..1a9018504
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Parameter.h
@@ -0,0 +1,65 @@
+/*
+ * Parameter.h
+ *
+ * Created on: 17 Feb 2014
+ * Author: hieu
+ */
+#pragma once
+
+#include <string>
+#include <limits>
+#include <vector>
+
+#define UNDEFINED std::numeric_limits<int>::max()
+
+class Parameter
+{
+public:
+ Parameter();
+ virtual ~Parameter();
+
+ int maxSpan;
+ int minSpan;
+ int maxNonTerm;
+ int maxHieroNonTerm;
+ int maxSymbolsTarget;
+ int maxSymbolsSource;
+ int minHoleSource;
+ int minHoleSourceSyntax;
+
+ long sentenceOffset;
+
+ bool nonTermConsecSource;
+ bool requireAlignedWord;
+ bool fractionalCounting;
+ bool gzOutput;
+
+ std::string hieroNonTerm;
+ std::string gluePath;
+
+ bool sourceSyntax, targetSyntax;
+
+ int mixedSyntaxType, multiLabel;
+ bool nonTermConsecSourceMixed;
+ bool hieroSourceLHS;
+ int maxSpanFreeNonTermSource;
+ bool nieceTerminal;
+ int maxScope, minScope;
+
+ // properties
+ bool spanLength;
+ bool nonTermContext;
+ bool nonTermContextTarget;
+ int nonTermContextFactor;
+
+ int numSourceFactors, numTargetFactors;
+
+ int nonTermConsecSourceMixedSyntax;
+
+ std::string scopeSpanStr;
+ std::vector<std::pair<int,int> > scopeSpan;
+
+ void SetScopeSpan(const std::string &str);
+
+};
+
diff --git a/phrase-extract/extract-mixed-syntax/Phrase.cpp b/phrase-extract/extract-mixed-syntax/Phrase.cpp
new file mode 100644
index 000000000..535e10d6b
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Phrase.cpp
@@ -0,0 +1,14 @@
+#include <sstream>
+#include "Phrase.h"
+
+std::string Phrase::Debug() const
+{
+ std::stringstream out;
+
+ for (size_t i = 0; i < size(); ++i) {
+ Word &word = *at(i);
+ out << word.Debug() << " ";
+ }
+
+ return out.str();
+}
diff --git a/phrase-extract/extract-mixed-syntax/Phrase.h b/phrase-extract/extract-mixed-syntax/Phrase.h
new file mode 100644
index 000000000..13912cb95
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Phrase.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <vector>
+#include "Word.h"
+
+// a vector of terminals
+class Phrase : public std::vector<Word*>
+{
+public:
+ Phrase()
+ {}
+
+ Phrase(size_t size)
+ :std::vector<Word*>(size)
+ {}
+
+ std::string Debug() const;
+
+};
diff --git a/phrase-extract/extract-mixed-syntax/Rule.cpp b/phrase-extract/extract-mixed-syntax/Rule.cpp
new file mode 100644
index 000000000..a3e148e6c
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Rule.cpp
@@ -0,0 +1,662 @@
+/*
+ * Rule.cpp
+ *
+ * Created on: 20 Feb 2014
+ * Author: hieu
+ */
+
+#include <sstream>
+#include <algorithm>
+#include "Rule.h"
+#include "AlignedSentence.h"
+#include "ConsistentPhrase.h"
+#include "NonTerm.h"
+#include "Parameter.h"
+
+using namespace std;
+
+Rule::Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence)
+:m_lhs(lhsNonTerm)
+,m_alignedSentence(alignedSentence)
+,m_isValid(true)
+,m_canRecurse(true)
+{
+ CreateSource();
+}
+
+Rule::Rule(const Rule &copy, const NonTerm &nonTerm)
+:m_lhs(copy.m_lhs)
+,m_alignedSentence(copy.m_alignedSentence)
+,m_isValid(true)
+,m_canRecurse(true)
+,m_nonterms(copy.m_nonterms)
+{
+ m_nonterms.push_back(&nonTerm);
+ CreateSource();
+
+}
+
+Rule::~Rule() {
+ // TODO Auto-generated destructor stub
+}
+
+const ConsistentPhrase &Rule::GetConsistentPhrase() const
+{ return m_lhs.GetConsistentPhrase(); }
+
+void Rule::CreateSource()
+{
+ const NonTerm *cp = NULL;
+ size_t nonTermInd = 0;
+ if (nonTermInd < m_nonterms.size()) {
+ cp = m_nonterms[nonTermInd];
+ }
+
+ for (int sourcePos = m_lhs.GetConsistentPhrase().corners[0];
+ sourcePos <= m_lhs.GetConsistentPhrase().corners[1];
+ ++sourcePos) {
+
+ const RuleSymbol *ruleSymbol;
+ if (cp && cp->GetConsistentPhrase().corners[0] <= sourcePos && sourcePos <= cp->GetConsistentPhrase().corners[1]) {
+ // replace words with non-term
+ ruleSymbol = cp;
+ sourcePos = cp->GetConsistentPhrase().corners[1];
+ if (m_nonterms.size()) {
+ cp = m_nonterms[nonTermInd];
+ }
+
+ // move to next non-term
+ ++nonTermInd;
+ cp = (nonTermInd < m_nonterms.size()) ? m_nonterms[nonTermInd] : NULL;
+ }
+ else {
+ // terminal
+ ruleSymbol = m_alignedSentence.GetPhrase(Moses::Input)[sourcePos];
+ }
+
+ m_source.Add(ruleSymbol);
+ }
+}
+
+int Rule::GetNextSourcePosForNonTerm() const
+{
+ if (m_nonterms.empty()) {
+ // no non-terms so far. Can start next non-term on left corner
+ return m_lhs.GetConsistentPhrase().corners[0];
+ }
+ else {
+ // next non-term can start just right of (immediately after) the previous one
+ const ConsistentPhrase &cp = m_nonterms.back()->GetConsistentPhrase();
+ int nextPos = cp.corners[1] + 1;
+ return nextPos;
+ }
+}
+
+std::string Rule::Debug() const
+{
+ stringstream out;
+
+ // source
+ for (size_t i = 0; i < m_source.GetSize(); ++i) {
+ const RuleSymbol &symbol = *m_source[i];
+ out << symbol.Debug() << " ";
+ }
+
+ // target
+ out << "||| ";
+ for (size_t i = 0; i < m_target.GetSize(); ++i) {
+ const RuleSymbol &symbol = *m_target[i];
+ out << symbol.Debug() << " ";
+ }
+
+ out << "||| ";
+ Alignments::const_iterator iterAlign;
+ for (iterAlign = m_alignments.begin(); iterAlign != m_alignments.end(); ++iterAlign) {
+ const std::pair<int,int> &alignPair = *iterAlign;
+ out << alignPair.first << "-" << alignPair.second << " ";
+ }
+
+ // overall range
+ out << "||| LHS=" << m_lhs.Debug();
+
+ return out.str();
+}
+
+void Rule::Output(std::ostream &out, bool forward, const Parameter &params) const
+{
+ if (forward) {
+ // source
+ m_source.Output(out);
+ m_lhs.Output(out, Moses::Input);
+
+ out << " ||| ";
+
+ // target
+ m_target.Output(out);
+ m_lhs.Output(out, Moses::Output);
+ }
+ else {
+ // target
+ m_target.Output(out);
+ m_lhs.Output(out, Moses::Output);
+
+ out << " ||| ";
+
+ // source
+ m_source.Output(out);
+ m_lhs.Output(out, Moses::Input);
+ }
+
+ out << " ||| ";
+
+ // alignment
+ Alignments::const_iterator iterAlign;
+ for (iterAlign = m_alignments.begin(); iterAlign != m_alignments.end(); ++iterAlign) {
+ const std::pair<int,int> &alignPair = *iterAlign;
+
+ if (forward) {
+ out << alignPair.first << "-" << alignPair.second << " ";
+ }
+ else {
+ out << alignPair.second << "-" << alignPair.first << " ";
+ }
+ }
+
+ out << "||| ";
+
+ // count
+ out << m_count;
+
+ out << " ||| ";
+
+ // properties
+
+ // span length
+ if (forward && params.spanLength && m_nonterms.size()) {
+ out << "{{SpanLength ";
+
+ for (size_t i = 0; i < m_nonterms.size(); ++i) {
+ const NonTerm &nonTerm = *m_nonterms[i];
+ const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
+ out << i << "," << cp.GetWidth(Moses::Input) << "," << cp.GetWidth(Moses::Output) << " ";
+ }
+ out << "}} ";
+ }
+
+ // non-term context (source)
+ if (forward && params.nonTermContext && m_nonterms.size()) {
+ out << "{{NonTermContext ";
+
+ int factor = params.nonTermContextFactor;
+
+ for (size_t i = 0; i < m_nonterms.size(); ++i) {
+ const NonTerm &nonTerm = *m_nonterms[i];
+ const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
+ NonTermContext(1, factor, i, cp, out);
+ }
+ out << "}} ";
+ }
+
+ // non-term context (target)
+ if (forward && params.nonTermContextTarget && m_nonterms.size()) {
+ out << "{{NonTermContextTarget ";
+
+ int factor = params.nonTermContextFactor;
+
+ for (size_t i = 0; i < m_nonterms.size(); ++i) {
+ const NonTerm &nonTerm = *m_nonterms[i];
+ const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
+ NonTermContext(2, factor, i, cp, out);
+ }
+ out << "}} ";
+ }
+
+}
+
+void Rule::NonTermContextFactor(int factor, const Word &word, std::ostream &out) const
+{
+ out << word.GetString(factor) << " ";
+}
+
+void Rule::NonTermContext(int sourceTarget, int factor, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const
+{
+ int startPos, endPos;
+ const Phrase *phrase;
+
+ if (sourceTarget == 1) {
+ startPos = cp.corners[0];
+ endPos = cp.corners[1];
+ phrase = &m_alignedSentence.GetPhrase(Moses::Input);
+ }
+ else if (sourceTarget == 2) {
+ startPos = cp.corners[2];
+ endPos = cp.corners[3];
+ phrase = &m_alignedSentence.GetPhrase(Moses::Output);
+ }
+ else {
+ abort();
+ }
+
+ out << ntInd << " ";
+
+ // left outside
+ if (startPos == 0) {
+ out << "<s> ";
+ }
+ else {
+ NonTermContextFactor(factor, *phrase->at(startPos - 1), out);
+ }
+
+ // left inside
+ NonTermContextFactor(factor, *phrase->at(startPos), out);
+
+ // right inside
+ NonTermContextFactor(factor, *phrase->at(endPos), out);
+
+ // right outside
+ if (endPos == phrase->size() - 1) {
+ out << "</s> ";
+ }
+ else {
+ NonTermContextFactor(factor, *phrase->at(endPos + 1), out);
+ }
+
+
+}
+
+void Rule::Prevalidate(const Parameter &params)
+{
+ const ConsistentPhrase &cp = m_lhs.GetConsistentPhrase();
+
+ // check number of source symbols in rule
+ if (m_source.GetSize() > params.maxSymbolsSource) {
+ m_isValid = false;
+ }
+
+ // check that last non-term added isn't too small
+ if (m_nonterms.size()) {
+ const NonTerm &lastNonTerm = *m_nonterms.back();
+ const ConsistentPhrase &cp = lastNonTerm.GetConsistentPhrase();
+
+ int sourceWidth = cp.GetWidth(Moses::Input);
+ if (lastNonTerm.IsHiero(params)) {
+ if (sourceWidth < params.minHoleSource) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ }
+ else if (sourceWidth < params.minHoleSourceSyntax) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+
+ }
+
+ // check number of non-terms
+ int numNonTerms = 0;
+ int numHieroNonTerms = 0;
+ for (size_t i = 0; i < m_source.GetSize(); ++i) {
+ const RuleSymbol *arc = m_source[i];
+ if (arc->IsNonTerm()) {
+ ++numNonTerms;
+ const NonTerm &nonTerm = *static_cast<const NonTerm*>(arc);
+ bool isHiero = nonTerm.IsHiero(params);
+ if (isHiero) {
+ ++numHieroNonTerms;
+ }
+ }
+ }
+
+ if (numNonTerms >= params.maxNonTerm) {
+ m_canRecurse = false;
+ if (numNonTerms > params.maxNonTerm) {
+ m_isValid = false;
+ return;
+ }
+ }
+
+ if (numHieroNonTerms >= params.maxHieroNonTerm) {
+ m_canRecurse = false;
+ if (numHieroNonTerms > params.maxHieroNonTerm) {
+ m_isValid = false;
+ return;
+ }
+ }
+
+ // check if 2 consecutive non-terms in source
+ if (!params.nonTermConsecSource && m_nonterms.size() >= 2) {
+ const NonTerm &lastNonTerm = *m_nonterms.back();
+ const NonTerm &secondLastNonTerm = *m_nonterms[m_nonterms.size() - 2];
+ if (secondLastNonTerm.GetConsistentPhrase().corners[1] + 1 ==
+ lastNonTerm.GetConsistentPhrase().corners[0]) {
+ if (params.mixedSyntaxType == 0) {
+ // ordinary hiero or syntax model
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ else {
+ // Hieu's mixed syntax
+ switch (params.nonTermConsecSourceMixedSyntax) {
+ case 0:
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ case 1:
+ if (lastNonTerm.IsHiero(Moses::Input, params)
+ && secondLastNonTerm.IsHiero(Moses::Input, params)) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ break;
+ case 2:
+ if (lastNonTerm.IsHiero(Moses::Input, params)
+ || secondLastNonTerm.IsHiero(Moses::Input, params)) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ break;
+ case 3:
+ break;
+ } // switch
+ }
+ }
+ }
+
+ //check to see if it overlaps with any other non-terms
+ if (m_nonterms.size() >= 2) {
+ const NonTerm &lastNonTerm = *m_nonterms.back();
+
+ for (size_t i = 0; i < m_nonterms.size() - 1; ++i) {
+ const NonTerm &otherNonTerm = *m_nonterms[i];
+ bool overlap = lastNonTerm.GetConsistentPhrase().TargetOverlap(otherNonTerm.GetConsistentPhrase());
+
+ if (overlap) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ }
+ }
+
+ // check that at least 1 word is aligned
+ if (params.requireAlignedWord) {
+ bool ok = false;
+ for (size_t i = 0; i < m_source.GetSize(); ++i) {
+ const RuleSymbol &symbol = *m_source[i];
+ if (!symbol.IsNonTerm()) {
+ const Word &word = static_cast<const Word&>(symbol);
+ if (word.GetAlignment().size()) {
+ ok = true;
+ break;
+ }
+ }
+ }
+
+ if (!ok) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ }
+
+ if (params.maxSpanFreeNonTermSource) {
+ const NonTerm *front = dynamic_cast<const NonTerm*>(m_source[0]);
+ if (front) {
+ int width = front->GetWidth(Moses::Input);
+ if (width > params.maxSpanFreeNonTermSource) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ }
+
+ const NonTerm *back = dynamic_cast<const NonTerm*>(m_source.Back());
+ if (back) {
+ int width = back->GetWidth(Moses::Input);
+ if (width > params.maxSpanFreeNonTermSource) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ }
+ }
+
+ if (!params.nieceTerminal) {
+ // collect terminal in a rule
+ std::set<const Word*> terms;
+ for (size_t i = 0; i < m_source.GetSize(); ++i) {
+ const Word *word = dynamic_cast<const Word*>(m_source[i]);
+ if (word) {
+ terms.insert(word);
+ }
+ }
+
+ // look in non-terms
+ for (size_t i = 0; i < m_source.GetSize(); ++i) {
+ const NonTerm *nonTerm = dynamic_cast<const NonTerm*>(m_source[i]);
+ if (nonTerm) {
+ const ConsistentPhrase &cp = nonTerm->GetConsistentPhrase();
+ bool containTerm = ContainTerm(cp, terms);
+
+ if (containTerm) {
+ //cerr << "ruleSource=" << *ruleSource << " ";
+ //cerr << "ntRange=" << ntRange << endl;
+
+ // non-term contains 1 of the terms in the rule.
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ }
+ }
+ }
+
+ if (params.maxScope != UNDEFINED || params.minScope > 0) {
+ int scope = GetScope(params);
+ if (scope > params.maxScope) {
+ // scope of subsequent rules will be the same or increase
+ // therefore can NOT recurse
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+
+ if (scope < params.minScope) {
+ // scope of subsequent rules may increase
+ // therefore can recurse
+ m_isValid = false;
+ }
+ }
+
+ // min/max span per scope
+ if (params.scopeSpan.size()) {
+ int scope = GetScope(params);
+ if (scope >= params.scopeSpan.size()) {
+ // no constraint on it. It's ok
+ }
+ else {
+ const std::pair<int,int> &constraint = params.scopeSpan[scope];
+ int sourceWidth = m_lhs.GetWidth(Moses::Input);
+ if (sourceWidth < constraint.first || sourceWidth > constraint.second) {
+ m_isValid = false;
+ m_canRecurse = false;
+ return;
+ }
+ }
+ }
+}
+
+// Compute the scope of this rule's source side: the number of boundary
+// positions adjacent to an "ambiguous" symbol (a non-term whose span is not
+// anchored by a neighbouring terminal).  Presumably follows the scope
+// definition used for scope-3 pruning (Hopkins & Langmead) -- TODO confirm.
+// For mixed syntax, an interior non-term only counts as ambiguous when it is
+// a hiero (non-syntactic) non-term.
+int Rule::GetScope(const Parameter &params) const
+{
+  // was size_t: the function returns int, so keep one signed type throughout
+  // and avoid the silent unsigned->signed narrowing on return
+  int scope = 0;
+  bool previousIsAmbiguous = false;
+
+  // NOTE(review): the first symbol counts as ambiguous whenever it is a
+  // non-term, without the IsHiero() check applied to interior symbols --
+  // presumably intentional (a leading non-term is always unanchored).
+  if (m_source[0]->IsNonTerm()) {
+    scope++;
+    previousIsAmbiguous = true;
+  }
+
+  for (size_t i = 1; i < m_source.GetSize(); ++i) {
+    const RuleSymbol *symbol = m_source[i];
+    bool isAmbiguous = symbol->IsNonTerm();
+    if (isAmbiguous) {
+      // mixed syntax: only hiero non-terms are ambiguous
+      const NonTerm *nt = static_cast<const NonTerm*>(symbol);
+      isAmbiguous = nt->IsHiero(Moses::Input, params);
+    }
+
+    // two ambiguous symbols in a row add a scope point between them
+    if (isAmbiguous && previousIsAmbiguous) {
+      scope++;
+    }
+    previousIsAmbiguous = isAmbiguous;
+  }
+
+  // a trailing ambiguous symbol adds one more scope point
+  if (previousIsAmbiguous) {
+    scope++;
+  }
+
+  return scope;
+}
+
+// Return true iff coll contains an element that compares string-equal to
+// *sought.  Comparison is by CompareString(), NOT pointer identity.
+// Fixed to be genuinely generic: the original body hard-coded Word in both
+// the iterator type and the element type, silently ignoring the template
+// parameter T (it only compiled because the sole instantiation was T=Word).
+template<typename T>
+bool Contains(const T *sought, const set<const T*> &coll)
+{
+  typename std::set<const T*>::const_iterator iter;
+  for (iter = coll.begin(); iter != coll.end(); ++iter) {
+    const T *found = *iter;
+    if (sought->CompareString(*found) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Does the source span of cp contain any word that string-matches one of the
+// given terminals of this rule?  Used to enforce the nieceTerminal constraint.
+bool Rule::ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const
+{
+  const Phrase &sourceSentence = m_alignedSentence.GetPhrase(Moses::Input);
+
+  const int begin = cp.corners[0];
+  const int end = cp.corners[1];
+  for (int pos = begin; pos <= end; ++pos) {
+    const Word *candidate = sourceSentence[pos];
+    // find same word (by string) in the set
+    if (Contains(candidate, terms)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Strict weak ordering for sorting non-terms by their start position on the
+// TARGET side (corners[2] = target start).  Used by Rule::CreateTarget() to
+// emit non-terms in target order.
+bool CompareTargetNonTerms(const NonTerm *a, const NonTerm *b)
+{
+ // compare just start target pos
+ return a->GetConsistentPhrase().corners[2] < b->GetConsistentPhrase().corners[2];
+}
+
+// Build the target side of the rule: walk the target span of the LHS and
+// emit either the covering non-term (skipping the words it spans) or the
+// terminal at that position.  Finally records word/non-term alignments.
+// No-op for rules already marked invalid.
+void Rule::CreateTarget(const Parameter &params)
+{
+  if (!m_isValid) {
+    return;
+  }
+
+  // non-terms, re-sorted by target start position
+  vector<const NonTerm*> targetNonTerm(m_nonterms);
+  std::sort(targetNonTerm.begin(), targetNonTerm.end(), CompareTargetNonTerms);
+
+  const NonTerm *cp = NULL;
+  size_t nonTermInd = 0;
+  if (nonTermInd < targetNonTerm.size()) {
+    cp = targetNonTerm[nonTermInd];
+  }
+
+  for (int targetPos = m_lhs.GetConsistentPhrase().corners[2];
+      targetPos <= m_lhs.GetConsistentPhrase().corners[3];
+      ++targetPos) {
+
+    const RuleSymbol *ruleSymbol;
+    if (cp && cp->GetConsistentPhrase().corners[2] <= targetPos && targetPos <= cp->GetConsistentPhrase().corners[3]) {
+      // replace words with non-term; jump to the end of its target span
+      ruleSymbol = cp;
+      targetPos = cp->GetConsistentPhrase().corners[3];
+
+      // move to next non-term.  (The original also assigned
+      // cp = targetNonTerm[nonTermInd] here, which was unconditionally
+      // overwritten by the line below -- removed as dead code.)
+      ++nonTermInd;
+      cp = (nonTermInd < targetNonTerm.size()) ? targetNonTerm[nonTermInd] : NULL;
+    }
+    else {
+      // terminal
+      ruleSymbol = m_alignedSentence.GetPhrase(Moses::Output)[targetPos];
+    }
+
+    m_target.Add(ruleSymbol);
+  }
+
+  CreateAlignments();
+}
+
+
+// Record alignments for every source symbol: a terminal aligns to each of
+// its aligned target words; a non-term aligns to itself on the target side
+// (the very same object appears in both m_source and m_target).
+// (The original computed sourceStart/targetStart locals that were never
+// used -- removed.)
+void Rule::CreateAlignments()
+{
+  for (size_t sourcePos = 0; sourcePos < m_source.GetSize(); ++sourcePos) {
+    const RuleSymbol *symbol = m_source[sourcePos];
+    if (!symbol->IsNonTerm()) {
+      // terminals
+      const Word &sourceWord = static_cast<const Word&>(*symbol);
+      const std::set<const Word *> &targetWords = sourceWord.GetAlignment();
+      CreateAlignments(sourcePos, targetWords);
+    }
+    else {
+      // non-terms. same object in both source & target
+      CreateAlignments(sourcePos, symbol);
+    }
+  }
+}
+
+// Align source position sourcePos to every target word in targetWords.
+void Rule::CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords)
+{
+  std::set<const Word *>::const_iterator p;
+  for (p = targetWords.begin(); p != targetWords.end(); ++p) {
+    const Word *target = *p;
+    CreateAlignments(sourcePos, target);
+  }
+}
+
+// Find targetSought in the target phrase -- by POINTER identity, not value --
+// and record the (sourcePos, targetPos) alignment point.
+// Throws a const char* if the symbol is absent: callers guarantee the symbol
+// is always in m_target, so this is an internal invariant check.
+void Rule::CreateAlignments(int sourcePos, const RuleSymbol *targetSought)
+{
+ // should be in target phrase
+ for (size_t targetPos = 0; targetPos < m_target.GetSize(); ++targetPos) {
+ const RuleSymbol *foundSymbol = m_target[targetPos];
+ if (targetSought == foundSymbol) {
+ pair<int, int> alignPoint(sourcePos, targetPos);
+ m_alignments.insert(alignPoint);
+ return;
+ }
+ }
+
+ throw "not found";
+}
+
diff --git a/phrase-extract/extract-mixed-syntax/Rule.h b/phrase-extract/extract-mixed-syntax/Rule.h
new file mode 100644
index 000000000..15a142b97
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Rule.h
@@ -0,0 +1,90 @@
+/*
+ * Rule.h
+ *
+ * Created on: 20 Feb 2014
+ * Author: hieu
+ */
+#pragma once
+#include <vector>
+#include "Phrase.h"
+#include "RulePhrase.h"
+#include "moses/TypeDef.h"
+
+class ConsistentPhrase;
+class AlignedSentence;
+class NonTerm;
+class Parameter;
+
+
+// One extraction rule: an LHS non-term over a consistent phrase, source and
+// target RulePhrases, the word alignments between them, and a count.
+// Built by Rules: Prevalidate() applies the extraction constraints,
+// CreateTarget() builds the target side and alignments.
+class Rule {
+public:
+ // set of (source pos, target pos) alignment points
+ typedef std::set<std::pair<int,int> > Alignments;
+
+ Rule(const Rule &copy); // do not implement
+
+ // original rule with no non-term
+ Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence);
+
+ // extend a rule, adding 1 new non-term
+ Rule(const Rule &copy, const NonTerm &nonTerm);
+
+ virtual ~Rule();
+
+ // false once a constraint in Prevalidate() fails: rule is discarded
+ bool IsValid() const
+ { return m_isValid; }
+
+ // false once extending this rule further can never yield a valid rule
+ bool CanRecurse() const
+ { return m_canRecurse; }
+
+ const NonTerm &GetLHS() const
+ { return m_lhs; }
+
+ const ConsistentPhrase &GetConsistentPhrase() const;
+
+ // leftmost source position where a new non-term may be added
+ int GetNextSourcePosForNonTerm() const;
+
+ // count assigned by Rules::Consolidate (unit or fractional)
+ void SetCount(float count)
+ { m_count = count; }
+ float GetCount() const
+ { return m_count; }
+
+ const Alignments &GetAlignments() const
+ { return m_alignments; }
+
+ std::string Debug() const;
+ void Output(std::ostream &out, bool forward, const Parameter &params) const;
+
+ void Prevalidate(const Parameter &params);
+ void CreateTarget(const Parameter &params);
+
+ const RulePhrase &GetPhrase(Moses::FactorDirection direction) const
+ { return (direction == Moses::Input) ? m_source : m_target; }
+
+protected:
+ const NonTerm &m_lhs;
+ const AlignedSentence &m_alignedSentence;
+ RulePhrase m_source, m_target;
+ float m_count;
+
+ Alignments m_alignments;
+
+ // in source order
+ std::vector<const NonTerm*> m_nonterms;
+
+ bool m_isValid, m_canRecurse;
+
+ void CreateSource();
+ void CreateAlignments();
+ void CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords);
+ void CreateAlignments(int sourcePos, const RuleSymbol *targetSought);
+
+ bool ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const;
+ int GetScope(const Parameter &params) const;
+
+ void NonTermContext(int sourceTarget, int factors, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const;
+ // sourceTarget: 1 = source, 2 = target
+
+ void NonTermContextFactor(int factor, const Word &word, std::ostream &out) const;
+
+};
+
diff --git a/phrase-extract/extract-mixed-syntax/RulePhrase.cpp b/phrase-extract/extract-mixed-syntax/RulePhrase.cpp
new file mode 100644
index 000000000..5c629168b
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/RulePhrase.cpp
@@ -0,0 +1,50 @@
+/*
+ * RulePhrase.cpp
+ *
+ * Created on: 26 Feb 2014
+ * Author: hieu
+ */
+
+#include <sstream>
+#include "RulePhrase.h"
+#include "RuleSymbol.h"
+
+using namespace std;
+
+extern bool g_debug;
+
+// Three-way comparison (-1/0/+1): shorter phrase sorts first; equal-length
+// phrases compare symbol by symbol.
+int RulePhrase::Compare(const RulePhrase &other) const
+{
+  const size_t mySize = GetSize();
+  const size_t otherSize = other.GetSize();
+  if (mySize != otherSize) {
+    return (mySize < otherSize) ? -1 : +1;
+  }
+
+  for (size_t i = 0; i < mySize; ++i) {
+    int cmp = m_coll[i]->Compare(*other.m_coll[i]);
+    if (cmp != 0) {
+      return cmp;
+    }
+  }
+
+  return 0;
+}
+
+// Write each symbol followed by a single space (trailing space after the
+// last symbol is kept -- part of the extract file format).
+void RulePhrase::Output(std::ostream &out) const
+{
+  for (size_t i = 0; i < m_coll.size(); ++i) {
+    const RuleSymbol &sym = *m_coll[i];
+    sym.Output(out);
+    out << " ";
+  }
+}
+
+// Render the phrase to a string via Output().
+std::string RulePhrase::Debug() const
+{
+  std::stringstream buffer;
+  Output(buffer);
+  return buffer.str();
+}
+
diff --git a/phrase-extract/extract-mixed-syntax/RulePhrase.h b/phrase-extract/extract-mixed-syntax/RulePhrase.h
new file mode 100644
index 000000000..412169b74
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/RulePhrase.h
@@ -0,0 +1,49 @@
+/*
+ * RulePhrase.h
+ *
+ * Created on: 26 Feb 2014
+ * Author: hieu
+ */
+
+#ifndef RULEPHRASE_H_
+#define RULEPHRASE_H_
+
+#include <vector>
+#include <cstddef>
+#include <iostream>
+
+class RuleSymbol;
+
+// a phrase of terms and non-terms for 1 side of a rule
+// Ordered sequence of (non-owning) RuleSymbol pointers forming one side of a
+// rule.  NOTE(review): m_coll is public -- callers appear to rely on direct
+// access; symbols are owned elsewhere (by the aligned sentence / phrases).
+class RulePhrase
+{
+public:
+ typedef std::vector<const RuleSymbol*> Coll;
+ Coll m_coll;
+
+ size_t GetSize() const
+ { return m_coll.size(); }
+
+ // appends; does not take ownership
+ void Add(const RuleSymbol *symbol)
+ {
+ m_coll.push_back(symbol);
+ }
+
+ const RuleSymbol* operator[](size_t index) const {
+ return m_coll[index];
+ }
+
+ // precondition: phrase is non-empty
+ const RuleSymbol* Front() const {
+ return m_coll.front();
+ }
+ const RuleSymbol* Back() const {
+ return m_coll.back();
+ }
+
+ // three-way comparison: size first, then symbol-by-symbol
+ int Compare(const RulePhrase &other) const;
+
+ void Output(std::ostream &out) const;
+ std::string Debug() const;
+};
+
+#endif /* RULEPHRASE_H_ */
diff --git a/phrase-extract/extract-mixed-syntax/RuleSymbol.cpp b/phrase-extract/extract-mixed-syntax/RuleSymbol.cpp
new file mode 100644
index 000000000..933ffc9c2
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/RuleSymbol.cpp
@@ -0,0 +1,36 @@
+/*
+ * RuleSymbol.cpp
+ *
+ * Created on: 21 Feb 2014
+ * Author: hieu
+ */
+
+#include "RuleSymbol.h"
+
+using namespace std;
+
+// RuleSymbol is an abstract base (terminal or non-term); no state to set up.
+RuleSymbol::RuleSymbol() {
+}
+
+// Virtual destructor so derived symbols can be deleted through the base.
+RuleSymbol::~RuleSymbol() {
+}
+
+// Total order over symbols (-1/0/+1): non-terms sort before terminals;
+// within the same kind, order by the string form.
+int RuleSymbol::Compare(const RuleSymbol &other) const
+{
+  const bool thisIsNonTerm = IsNonTerm();
+  if (thisIsNonTerm != other.IsNonTerm()) {
+    // a non-term always compares less than a terminal
+    return thisIsNonTerm ? -1 : +1;
+  }
+
+  const string mine = GetString();
+  const string theirs = other.GetString();
+  if (mine < theirs) {
+    return -1;
+  }
+  if (theirs < mine) {
+    return +1;
+  }
+  return 0;
+}
diff --git a/phrase-extract/extract-mixed-syntax/RuleSymbol.h b/phrase-extract/extract-mixed-syntax/RuleSymbol.h
new file mode 100644
index 000000000..c292fcc0d
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/RuleSymbol.h
@@ -0,0 +1,31 @@
+/*
+ * RuleSymbol.h
+ *
+ * Created on: 21 Feb 2014
+ * Author: hieu
+ */
+
+#ifndef RULESYMBOL_H_
+#define RULESYMBOL_H_
+
+#include <iostream>
+#include <string>
+
+// base class - terminal or non-term
+class RuleSymbol {
+public:
+ RuleSymbol();
+ virtual ~RuleSymbol();
+
+ // true for NonTerm, false for Word (terminal)
+ virtual bool IsNonTerm() const = 0;
+
+ virtual std::string Debug() const = 0;
+ virtual void Output(std::ostream &out) const = 0;
+
+ // string form used for comparison and output
+ virtual std::string GetString() const = 0;
+
+ // three-way compare: kind first (non-terms sort first), then GetString()
+ int Compare(const RuleSymbol &other) const;
+
+};
+
+#endif /* RULESYMBOL_H_ */
diff --git a/phrase-extract/extract-mixed-syntax/Rules.cpp b/phrase-extract/extract-mixed-syntax/Rules.cpp
new file mode 100644
index 000000000..1b93430e2
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Rules.cpp
@@ -0,0 +1,227 @@
+/*
+ * Rules.cpp
+ *
+ * Created on: 20 Feb 2014
+ * Author: hieu
+ */
+
+#include <sstream>
+#include "Rules.h"
+#include "ConsistentPhrase.h"
+#include "ConsistentPhrases.h"
+#include "AlignedSentence.h"
+#include "Rule.h"
+#include "Parameter.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+extern bool g_debug;
+
+// Rules are extracted from (and keep a reference to) one aligned sentence;
+// the sentence must outlive this object.
+Rules::Rules(const AlignedSentence &alignedSentence)
+:m_alignedSentence(alignedSentence)
+{
+}
+
+// Owns the rules in m_keepRules; m_mergeRules aliases the same objects, so
+// only one container is deleted.
+Rules::~Rules() {
+ Moses::RemoveAllInColl(m_keepRules);
+}
+
+// Create the initial (seed) rules for one consistent phrase: either a single
+// hiero-LHS rule, or one rule per syntactic non-term label of the phrase.
+void Rules::CreateRules(const ConsistentPhrase &cp,
+    const Parameter &params)
+{
+  if (params.hieroSourceLHS) {
+    const NonTerm &hieroNonTerm = cp.GetHieroNonTerm();
+    CreateRule(hieroNonTerm, params);
+    return;
+  }
+
+  const ConsistentPhrase::NonTerms &nonTerms = cp.GetNonTerms();
+  for (size_t i = 0; i < nonTerms.size(); ++i) {
+    CreateRule(nonTerms[i], params);
+  }
+}
+
+// Build the seed rule for one LHS non-term, recursively extend it with more
+// non-terms while allowed, then keep it if valid (otherwise discard it).
+void Rules::CreateRule(const NonTerm &nonTerm,
+    const Parameter &params)
+{
+  Rule *rule = new Rule(nonTerm, m_alignedSentence);
+
+  rule->Prevalidate(params);
+  rule->CreateTarget(params);
+
+  if (rule->CanRecurse()) {
+    // add further non-terms to this rule
+    Extend(*rule, params);
+  }
+
+  if (!rule->IsValid()) {
+    delete rule;
+    return;
+  }
+  m_keepRules.insert(rule);
+}
+
+// Entry point: create (and recursively extend) rules for every consistent
+// phrase over every source span of the sentence.
+void Rules::Extend(const Parameter &params)
+{
+  const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
+  const size_t size = m_alignedSentence.GetPhrase(Moses::Input).size();
+
+  for (size_t sourceStart = 0; sourceStart < size; ++sourceStart) {
+    for (size_t sourceEnd = sourceStart; sourceEnd < size; ++sourceEnd) {
+      const ConsistentPhrases::Coll &cps = allCPS.GetColl(sourceStart, sourceEnd);
+
+      ConsistentPhrases::Coll::const_iterator p;
+      for (p = cps.begin(); p != cps.end(); ++p) {
+        CreateRules(**p, params);
+      }
+    }
+  }
+}
+
+// Try every sub-span of the rule's source span as the site of a new
+// non-term.  Starting at GetNextSourcePosForNonTerm() enforces left-to-right
+// addition of non-terms, which avoids deriving the same rule twice.
+void Rules::Extend(const Rule &rule, const Parameter &params)
+{
+ const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
+ int sourceMin = rule.GetNextSourcePosForNonTerm();
+
+ int ruleStart = rule.GetConsistentPhrase().corners[0];
+ int ruleEnd = rule.GetConsistentPhrase().corners[1];
+
+ for (int sourceStart = sourceMin; sourceStart <= ruleEnd; ++sourceStart) {
+ for (int sourceEnd = sourceStart; sourceEnd <= ruleEnd; ++sourceEnd) {
+ if (sourceStart == ruleStart && sourceEnd == ruleEnd) {
+ // don't cover whole rule with 1 non-term
+ continue;
+ }
+
+ const ConsistentPhrases::Coll &cps = allCPS.GetColl(sourceStart, sourceEnd);
+ Extend(rule, cps, params);
+ }
+ }
+}
+
+// Try to extend the rule with each consistent phrase in the collection.
+void Rules::Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter &params)
+{
+  ConsistentPhrases::Coll::const_iterator p;
+  for (p = cps.begin(); p != cps.end(); ++p) {
+    const ConsistentPhrase &candidate = **p;
+    Extend(rule, candidate, params);
+  }
+}
+
+// Extend the rule by replacing cp's source span with each of cp's non-term
+// labels, recursing while further extension is allowed; keep valid results.
+void Rules::Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter &params)
+{
+  const ConsistentPhrase::NonTerms &nonTerms = cp.GetNonTerms();
+  for (size_t i = 0; i < nonTerms.size(); ++i) {
+    Rule *extended = new Rule(rule, nonTerms[i]);
+    extended->Prevalidate(params);
+    extended->CreateTarget(params);
+
+    if (extended->CanRecurse()) {
+      // recursively extend
+      Extend(*extended, params);
+    }
+
+    if (!extended->IsValid()) {
+      delete extended;
+      continue;
+    }
+    m_keepRules.insert(extended);
+  }
+}
+
+// Dump all kept rules, one per line, for debugging.
+std::string Rules::Debug() const
+{
+  stringstream out;
+  out << "m_keepRules:" << endl;
+
+  std::set<Rule*>::const_iterator p;
+  for (p = m_keepRules.begin(); p != m_keepRules.end(); ++p) {
+    out << (*p)->Debug() << endl;
+  }
+
+  return out.str();
+}
+
+// Write the merged (deduplicated) rules to the extract stream, one per line.
+void Rules::Output(std::ostream &out, bool forward, const Parameter &params) const
+{
+  std::set<Rule*, CompareRules>::const_iterator p;
+  for (p = m_mergeRules.begin(); p != m_mergeRules.end(); ++p) {
+    (*p)->Output(out, forward, params);
+    out << endl;
+  }
+}
+
+// Assign counts to every kept rule -- fractional (shared per consistent
+// phrase) or unit -- then merge duplicate rules, summing counts.
+void Rules::Consolidate(const Parameter &params)
+{
+  if (params.fractionalCounting) {
+    CalcFractionalCount();
+  }
+  else {
+    // each extracted rule counts as 1
+    std::set<Rule*>::iterator p;
+    for (p = m_keepRules.begin(); p != m_keepRules.end(); ++p) {
+      (*p)->SetCount(1);
+    }
+  }
+
+  MergeRules(params);
+}
+
+// Deduplicate: insert every kept rule into m_mergeRules (ordered by
+// CompareRules); when an equal rule is already present, add the counts
+// instead.  Mutating the count of an element inside the set is safe here
+// because the count takes no part in the CompareRules ordering.
+void Rules::MergeRules(const Parameter &params)
+{
+ typedef std::set<Rule*, CompareRules> MergeRules;
+
+ std::set<Rule*>::const_iterator iterOrig;
+ for (iterOrig = m_keepRules.begin(); iterOrig != m_keepRules.end(); ++iterOrig) {
+ Rule *origRule = *iterOrig;
+
+ pair<MergeRules::iterator, bool> inserted = m_mergeRules.insert(origRule);
+ if (!inserted.second) {
+ // already there, just add count
+ Rule &rule = **inserted.first;
+ float newCount = rule.GetCount() + origRule->GetCount();
+ rule.SetCount(newCount);
+ }
+ }
+}
+
+// Fractional counting: all rules extracted from the same consistent phrase
+// share one unit of count equally (1/k for each of the k rules).
+void Rules::CalcFractionalCount()
+{
+  typedef std::set<Rule*> RuleColl;
+  typedef std::map<const ConsistentPhrase*, RuleColl> RuleByConsistentPhrase;
+  RuleByConsistentPhrase allRules;
+
+  // group rules by the consistent phrase they were extracted from
+  std::set<Rule*>::const_iterator p;
+  for (p = m_keepRules.begin(); p != m_keepRules.end(); ++p) {
+    Rule *rule = *p;
+    allRules[&rule->GetConsistentPhrase()].insert(rule);
+  }
+
+  // split one unit of count evenly within each group
+  RuleByConsistentPhrase::iterator groupIter;
+  for (groupIter = allRules.begin(); groupIter != allRules.end(); ++groupIter) {
+    RuleColl &rules = groupIter->second;
+    const float share = 1.0f / (float) rules.size();
+
+    RuleColl::iterator ruleIter;
+    for (ruleIter = rules.begin(); ruleIter != rules.end(); ++ruleIter) {
+      (*ruleIter)->SetCount(share);
+    }
+  }
+}
+
+
diff --git a/phrase-extract/extract-mixed-syntax/Rules.h b/phrase-extract/extract-mixed-syntax/Rules.h
new file mode 100644
index 000000000..6d8cb122d
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Rules.h
@@ -0,0 +1,72 @@
+/*
+ * Rules.h
+ *
+ * Created on: 20 Feb 2014
+ * Author: hieu
+ */
+
+#pragma once
+
+#include <set>
+#include <iostream>
+#include "ConsistentPhrases.h"
+#include "Rule.h"
+
+extern bool g_debug;
+
+class AlignedSentence;
+class Parameter;
+
+// Strict weak ordering used to deduplicate rules: compare source phrase,
+// then target phrase, then the alignment set, then the LHS label.  Rules
+// that compare equal here are merged (counts summed) in Rules::MergeRules().
+struct CompareRules {
+  // const-qualified: a std::set may invoke its comparator through a const
+  // object/reference; the original non-const operator() relied on
+  // implementation leniency.
+  bool operator()(const Rule *a, const Rule *b) const
+  {
+    int compare;
+
+    compare = a->GetPhrase(Moses::Input).Compare(b->GetPhrase(Moses::Input));
+    if (compare) return compare < 0;
+
+    compare = a->GetPhrase(Moses::Output).Compare(b->GetPhrase(Moses::Output));
+    if (compare) return compare < 0;
+
+    if (a->GetAlignments() != b->GetAlignments()) {
+      return a->GetAlignments() < b->GetAlignments();
+    }
+
+    if (a->GetLHS().GetString() != b->GetLHS().GetString()) {
+      return a->GetLHS().GetString() < b->GetLHS().GetString();
+    }
+
+    return false;
+  }
+};
+
+// Extracts, deduplicates and outputs all rules for one aligned sentence.
+// Usage: Extend(params) -> Consolidate(params) -> Output(...).
+class Rules {
+public:
+ Rules(const AlignedSentence &alignedSentence);
+ virtual ~Rules();
+ // create + recursively extend rules over all consistent phrases
+ void Extend(const Parameter &params);
+ // assign counts and merge duplicate rules
+ void Consolidate(const Parameter &params);
+
+ std::string Debug() const;
+ void Output(std::ostream &out, bool forward, const Parameter &params) const;
+
+protected:
+ const AlignedSentence &m_alignedSentence;
+ // owns the rules; m_mergeRules only aliases them
+ std::set<Rule*> m_keepRules;
+ // deduplicated view, ordered by CompareRules; counts merged on insert
+ std::set<Rule*, CompareRules> m_mergeRules;
+
+ void Extend(const Rule &rule, const Parameter &params);
+ void Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter &params);
+ void Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter &params);
+
+ // create original rules
+ void CreateRules(const ConsistentPhrase &cp,
+ const Parameter &params);
+ void CreateRule(const NonTerm &nonTerm,
+ const Parameter &params);
+
+ void MergeRules(const Parameter &params);
+ void CalcFractionalCount();
+
+};
+
diff --git a/phrase-extract/extract-mixed-syntax/SyntaxTree.cpp b/phrase-extract/extract-mixed-syntax/SyntaxTree.cpp
new file mode 100644
index 000000000..472444e7c
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/SyntaxTree.cpp
@@ -0,0 +1,47 @@
+#include <cassert>
+#include <iostream>
+#include "SyntaxTree.h"
+#include "Parameter.h"
+
+using namespace std;
+
+// Attach a label to the [startPos, endPos] span.  params.multiLabel decides
+// what happens when the span already has labels: 1 = newest label replaces
+// the old one, 2 = first label wins (drop this one), otherwise accumulate.
+void SyntaxTree::Add(int startPos, int endPos, const std::string &label, const Parameter &params)
+{
+  Labels &labels = m_coll[Range(startPos, endPos)];
+
+  if (!labels.empty()) {
+    if (params.multiLabel == 1) {
+      // replace: drop the existing label and fall through to add the new one
+      assert(labels.size() == 1);
+      labels.clear();
+    }
+    else if (params.multiLabel == 2) {
+      // keep the first label only
+      return;
+    }
+  }
+
+  labels.push_back(label);
+}
+
+// Append the label to every span currently stored in the tree.
+void SyntaxTree::AddToAll(const std::string &label)
+{
+  for (Coll::iterator p = m_coll.begin(); p != m_coll.end(); ++p) {
+    p->second.push_back(label);
+  }
+}
+
+// Labels of the span [startPos, endPos]; if the span is not in the tree,
+// fall back to the default (hiero) labels.
+const SyntaxTree::Labels &SyntaxTree::Find(int startPos, int endPos) const
+{
+  Coll::const_iterator p = m_coll.find(Range(startPos, endPos));
+  if (p == m_coll.end()) {
+    return m_defaultLabels;
+  }
+  return p->second;
+}
diff --git a/phrase-extract/extract-mixed-syntax/SyntaxTree.h b/phrase-extract/extract-mixed-syntax/SyntaxTree.h
new file mode 100644
index 000000000..58f718151
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/SyntaxTree.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <vector>
+#include <map>
+#include <string>
+
+class Parameter;
+
+// Maps source spans [start, end] to the syntactic labels covering them.
+// Spans not in the map fall back to m_defaultLabels (set via SetHieroLabel).
+class SyntaxTree
+{
+public:
+ typedef std::pair<int, int> Range;
+ typedef std::vector<std::string> Labels;
+ typedef std::map<Range, Labels> Coll;
+
+ // behaviour on duplicate spans is controlled by params.multiLabel
+ void Add(int startPos, int endPos, const std::string &label, const Parameter &params);
+ void AddToAll(const std::string &label);
+
+ // returns m_defaultLabels when the span is unknown
+ const Labels &Find(int startPos, int endPos) const;
+
+ void SetHieroLabel(const std::string &label) {
+ m_defaultLabels.push_back(label);
+ }
+
+
+protected:
+
+ Coll m_coll;
+ Labels m_defaultLabels;
+};
+
+
diff --git a/phrase-extract/extract-mixed-syntax/Word.cpp b/phrase-extract/extract-mixed-syntax/Word.cpp
new file mode 100644
index 000000000..8ce4f76c6
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Word.cpp
@@ -0,0 +1,68 @@
+/*
+ * Word.cpp
+ *
+ * Created on: 18 Feb 2014
+ * Author: s0565741
+ */
+#include <limits>
+#include "Word.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+// A terminal: surface (possibly factored) string plus its position in the
+// original sentence.
+Word::Word(int pos, const std::string &str)
+:m_pos(pos)
+,m_str(str)
+{
+}
+
+// No owned resources; m_alignment holds non-owning pointers.
+Word::~Word() {
+}
+
+// Record an alignment to a word in the other language (non-owning pointer).
+void Word::AddAlignment(const Word *other)
+{
+ m_alignment.insert(other);
+}
+
+// Sentence positions of all words aligned to this one, as a sorted int set.
+std::set<int> Word::GetAlignmentIndex() const
+{
+  std::set<int> positions;
+
+  std::set<const Word *>::const_iterator p;
+  for (p = m_alignment.begin(); p != m_alignment.end(); ++p) {
+    const Word &alignedWord = **p;
+    positions.insert(alignedWord.GetPos());
+  }
+
+  return positions;
+}
+
+// Write the raw (factored) surface string.
+void Word::Output(std::ostream &out) const
+{
+ out << m_str;
+}
+
+// Debug form is just the surface string.
+std::string Word::Debug() const
+{
+ return m_str;
+}
+
+// Three-way string comparison of the surface strings (std::string::compare).
+int Word::CompareString(const Word &other) const
+{
+ return m_str.compare(other.m_str);
+}
+
+// Return one factor of this word; factors are "|"-separated fields of the
+// surface string (Moses factored representation).
+std::string Word::GetString(int factor) const
+{
+  vector<string> toks;
+  Moses::Tokenize(toks, m_str, "|");
+
+  // guard against a negative factor and avoid the signed/unsigned comparison
+  // of the original assert (int < size_t promotes the int to unsigned)
+  assert(factor >= 0 && (size_t) factor < toks.size());
+  return toks[factor];
+}
+
+
diff --git a/phrase-extract/extract-mixed-syntax/Word.h b/phrase-extract/extract-mixed-syntax/Word.h
new file mode 100644
index 000000000..54419ceb0
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/Word.h
@@ -0,0 +1,49 @@
+/*
+ * Word.h
+ *
+ * Created on: 18 Feb 2014
+ * Author: s0565741
+ */
+#pragma once
+
+#include <string>
+#include <set>
+#include "RuleSymbol.h"
+
+// a terminal
+class Word : public RuleSymbol
+{
+public:
+ Word(const Word&); // do not implement
+ // pos = position in the original sentence; str = (factored) surface form
+ Word(int pos, const std::string &str);
+ virtual ~Word();
+
+ // a Word is always a terminal
+ virtual bool IsNonTerm() const
+ { return false; }
+
+ // full surface string, factors included
+ std::string GetString() const
+ { return m_str; }
+
+ // single "|"-separated factor of the surface string
+ std::string GetString(int factor) const;
+
+ int GetPos() const
+ { return m_pos; }
+
+ // record alignment to a word of the other language (non-owning)
+ void AddAlignment(const Word *other);
+
+ const std::set<const Word *> &GetAlignment() const
+ { return m_alignment; }
+
+ // positions of all aligned words, sorted
+ std::set<int> GetAlignmentIndex() const;
+
+ void Output(std::ostream &out) const;
+ std::string Debug() const;
+
+ // three-way comparison of surface strings
+ int CompareString(const Word &other) const;
+
+protected:
+ int m_pos; // original position in sentence, NOT in lattice
+ std::string m_str;
+ std::set<const Word *> m_alignment;
+};
+
diff --git a/contrib/other-builds/extract-mixed-syntax/gzfilebuf.h b/phrase-extract/extract-mixed-syntax/gzfilebuf.h
index 885c661f0..885c661f0 100644
--- a/contrib/other-builds/extract-mixed-syntax/gzfilebuf.h
+++ b/phrase-extract/extract-mixed-syntax/gzfilebuf.h
diff --git a/phrase-extract/extract-mixed-syntax/pugiconfig.hpp b/phrase-extract/extract-mixed-syntax/pugiconfig.hpp
new file mode 100644
index 000000000..c2196715c
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/pugiconfig.hpp
@@ -0,0 +1,69 @@
+/**
+ * pugixml parser - version 1.2
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef HEADER_PUGICONFIG_HPP
+#define HEADER_PUGICONFIG_HPP
+
+// Uncomment this to enable wchar_t mode
+// #define PUGIXML_WCHAR_MODE
+
+// Uncomment this to disable XPath
+// #define PUGIXML_NO_XPATH
+
+// Uncomment this to disable STL
+// #define PUGIXML_NO_STL
+
+// Uncomment this to disable exceptions
+// #define PUGIXML_NO_EXCEPTIONS
+
+// Set this to control attributes for public classes/functions, i.e.:
+// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
+// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
+// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
+// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
+
+// Uncomment this to switch to header-only version
+// #define PUGIXML_HEADER_ONLY
+// #include "pugixml.cpp"
+
+// Tune these constants to adjust memory-related behavior
+// #define PUGIXML_MEMORY_PAGE_SIZE 32768
+// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
+// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
+
+#endif
+
+/**
+ * Copyright (c) 2006-2012 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/phrase-extract/extract-mixed-syntax/pugixml.cpp b/phrase-extract/extract-mixed-syntax/pugixml.cpp
new file mode 100644
index 000000000..4035ab1cf
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/pugixml.cpp
@@ -0,0 +1,10250 @@
+/**
+ * pugixml parser - version 1.2
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef SOURCE_PUGIXML_CPP
+#define SOURCE_PUGIXML_CPP
+
+#include "pugixml.hpp"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <wchar.h>
+
+#ifndef PUGIXML_NO_XPATH
+# include <math.h>
+# include <float.h>
+# ifdef PUGIXML_NO_EXCEPTIONS
+# include <setjmp.h>
+# endif
+#endif
+
+#ifndef PUGIXML_NO_STL
+# include <istream>
+# include <ostream>
+# include <string>
+#endif
+
+// For placement new
+#include <new>
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable: 4127) // conditional expression is constant
+# pragma warning(disable: 4324) // structure was padded due to __declspec(align())
+# pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
+# pragma warning(disable: 4702) // unreachable code
+# pragma warning(disable: 4996) // this function or variable may be unsafe
+# pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged
+#endif
+
+#ifdef __INTEL_COMPILER
+# pragma warning(disable: 177) // function was declared but never referenced
+# pragma warning(disable: 279) // controlling expression is constant
+# pragma warning(disable: 1478 1786) // function was declared "deprecated"
+# pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
+#endif
+
+#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY)
+# pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away
+#endif
+
+#ifdef __BORLANDC__
+# pragma option push
+# pragma warn -8008 // condition is always false
+# pragma warn -8066 // unreachable code
+#endif
+
+#ifdef __SNC__
+// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug
+# pragma diag_suppress=178 // function was declared but never referenced
+# pragma diag_suppress=237 // controlling expression is constant
+#endif
+
+// Inlining controls
+#if defined(_MSC_VER) && _MSC_VER >= 1300
+# define PUGI__NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__)
+# define PUGI__NO_INLINE __attribute__((noinline))
+#else
+# define PUGI__NO_INLINE
+#endif
+
+// Simple static assertion
+#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; }
+
+// Digital Mars C++ bug workaround for passing char loaded from memory via stack
+#ifdef __DMC__
+# define PUGI__DMC_VOLATILE volatile
+#else
+# define PUGI__DMC_VOLATILE
+#endif
+
+// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all)
+#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
+using std::memcpy;
+using std::memmove;
+#endif
+
+// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
+#if defined(_MSC_VER) && !defined(__S3E__)
+# define PUGI__MSVC_CRT_VERSION _MSC_VER
+#endif
+
+#ifdef PUGIXML_HEADER_ONLY
+# define PUGI__NS_BEGIN namespace pugi { namespace impl {
+# define PUGI__NS_END } }
+# define PUGI__FN inline
+# define PUGI__FN_NO_INLINE inline
+#else
+# if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces
+# define PUGI__NS_BEGIN namespace pugi { namespace impl {
+# define PUGI__NS_END } }
+# else
+# define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace {
+# define PUGI__NS_END } } }
+# endif
+# define PUGI__FN
+# define PUGI__FN_NO_INLINE PUGI__NO_INLINE
+#endif
+
+// uintptr_t
+#if !defined(_MSC_VER) || _MSC_VER >= 1600
+# include <stdint.h>
+#else
+# ifndef _UINTPTR_T_DEFINED
+// No native uintptr_t in MSVC6 and in some WinCE versions
+typedef size_t uintptr_t;
+#define _UINTPTR_T_DEFINED
+# endif
+PUGI__NS_BEGIN
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int16 uint16_t;
+ typedef unsigned __int32 uint32_t;
+PUGI__NS_END
+#endif
+
+// Memory allocation
+PUGI__NS_BEGIN
+ PUGI__FN void* default_allocate(size_t size)
+ {
+ return malloc(size);
+ }
+
+ PUGI__FN void default_deallocate(void* ptr)
+ {
+ free(ptr);
+ }
+
+ template <typename T>
+ struct xml_memory_management_function_storage
+ {
+ static allocation_function allocate;
+ static deallocation_function deallocate;
+ };
+
+ template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
+ template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
+
+ typedef xml_memory_management_function_storage<int> xml_memory;
+PUGI__NS_END
+
+// String utilities
+PUGI__NS_BEGIN
+ // Get string length
+ PUGI__FN size_t strlength(const char_t* s)
+ {
+ assert(s);
+
+ #ifdef PUGIXML_WCHAR_MODE
+ return wcslen(s);
+ #else
+ return strlen(s);
+ #endif
+ }
+
+ // Compare two strings
+ PUGI__FN bool strequal(const char_t* src, const char_t* dst)
+ {
+ assert(src && dst);
+
+ #ifdef PUGIXML_WCHAR_MODE
+ return wcscmp(src, dst) == 0;
+ #else
+ return strcmp(src, dst) == 0;
+ #endif
+ }
+
+ // Compare lhs with [rhs_begin, rhs_end)
+ PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count)
+ {
+ for (size_t i = 0; i < count; ++i)
+ if (lhs[i] != rhs[i])
+ return false;
+
+ return lhs[count] == 0;
+ }
+
+#ifdef PUGIXML_WCHAR_MODE
+ // Convert string to wide string, assuming all symbols are ASCII
+ PUGI__FN void widen_ascii(wchar_t* dest, const char* source)
+ {
+ for (const char* i = source; *i; ++i) *dest++ = *i;
+ *dest = 0;
+ }
+#endif
+PUGI__NS_END
+
+#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
+// auto_ptr-like buffer holder for exception recovery
+PUGI__NS_BEGIN
+ struct buffer_holder
+ {
+ void* data;
+ void (*deleter)(void*);
+
+ buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_)
+ {
+ }
+
+ ~buffer_holder()
+ {
+ if (data) deleter(data);
+ }
+
+ void* release()
+ {
+ void* result = data;
+ data = 0;
+ return result;
+ }
+ };
+PUGI__NS_END
+#endif
+
+PUGI__NS_BEGIN
+ static const size_t xml_memory_page_size =
+ #ifdef PUGIXML_MEMORY_PAGE_SIZE
+ PUGIXML_MEMORY_PAGE_SIZE
+ #else
+ 32768
+ #endif
+ ;
+
+ static const uintptr_t xml_memory_page_alignment = 32;
+ static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1);
+ static const uintptr_t xml_memory_page_name_allocated_mask = 16;
+ static const uintptr_t xml_memory_page_value_allocated_mask = 8;
+ static const uintptr_t xml_memory_page_type_mask = 7;
+
+ struct xml_allocator;
+
+ struct xml_memory_page
+ {
+ static xml_memory_page* construct(void* memory)
+ {
+ if (!memory) return 0; //$ redundant, left for performance
+
+ xml_memory_page* result = static_cast<xml_memory_page*>(memory);
+
+ result->allocator = 0;
+ result->memory = 0;
+ result->prev = 0;
+ result->next = 0;
+ result->busy_size = 0;
+ result->freed_size = 0;
+
+ return result;
+ }
+
+ xml_allocator* allocator;
+
+ void* memory;
+
+ xml_memory_page* prev;
+ xml_memory_page* next;
+
+ size_t busy_size;
+ size_t freed_size;
+
+ char data[1];
+ };
+
+ struct xml_memory_string_header
+ {
+ uint16_t page_offset; // offset from page->data
+ uint16_t full_size; // 0 if string occupies whole page
+ };
+
+ struct xml_allocator
+ {
+ xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size)
+ {
+ }
+
+ xml_memory_page* allocate_page(size_t data_size)
+ {
+ size_t size = offsetof(xml_memory_page, data) + data_size;
+
+ // allocate block with some alignment, leaving memory for worst-case padding
+ void* memory = xml_memory::allocate(size + xml_memory_page_alignment);
+ if (!memory) return 0;
+
+ // align upwards to page boundary
+ void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(memory) + (xml_memory_page_alignment - 1)) & ~(xml_memory_page_alignment - 1));
+
+ // prepare page structure
+ xml_memory_page* page = xml_memory_page::construct(page_memory);
+
+ page->memory = memory;
+ page->allocator = _root->allocator;
+
+ return page;
+ }
+
+ static void deallocate_page(xml_memory_page* page)
+ {
+ xml_memory::deallocate(page->memory);
+ }
+
+ void* allocate_memory_oob(size_t size, xml_memory_page*& out_page);
+
+ void* allocate_memory(size_t size, xml_memory_page*& out_page)
+ {
+ if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page);
+
+ void* buf = _root->data + _busy_size;
+
+ _busy_size += size;
+
+ out_page = _root;
+
+ return buf;
+ }
+
+ void deallocate_memory(void* ptr, size_t size, xml_memory_page* page)
+ {
+ if (page == _root) page->busy_size = _busy_size;
+
+ assert(ptr >= page->data && ptr < page->data + page->busy_size);
+ (void)!ptr;
+
+ page->freed_size += size;
+ assert(page->freed_size <= page->busy_size);
+
+ if (page->freed_size == page->busy_size)
+ {
+ if (page->next == 0)
+ {
+ assert(_root == page);
+
+ // top page freed, just reset sizes
+ page->busy_size = page->freed_size = 0;
+ _busy_size = 0;
+ }
+ else
+ {
+ assert(_root != page);
+ assert(page->prev);
+
+ // remove from the list
+ page->prev->next = page->next;
+ page->next->prev = page->prev;
+
+ // deallocate
+ deallocate_page(page);
+ }
+ }
+ }
+
+ char_t* allocate_string(size_t length)
+ {
+ // allocate memory for string and header block
+ size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
+
+ // round size up to pointer alignment boundary
+ size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
+
+ xml_memory_page* page;
+ xml_memory_string_header* header = static_cast<xml_memory_string_header*>(allocate_memory(full_size, page));
+
+ if (!header) return 0;
+
+ // setup header
+ ptrdiff_t page_offset = reinterpret_cast<char*>(header) - page->data;
+
+ assert(page_offset >= 0 && page_offset < (1 << 16));
+ header->page_offset = static_cast<uint16_t>(page_offset);
+
+ // full_size == 0 for large strings that occupy the whole page
+ assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
+ header->full_size = static_cast<uint16_t>(full_size < (1 << 16) ? full_size : 0);
+
+ // round-trip through void* to avoid 'cast increases required alignment of target type' warning
+ // header is guaranteed a pointer-sized alignment, which should be enough for char_t
+ return static_cast<char_t*>(static_cast<void*>(header + 1));
+ }
+
+ void deallocate_string(char_t* string)
+ {
+ // this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings
+ // we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string
+
+ // get header
+ xml_memory_string_header* header = static_cast<xml_memory_string_header*>(static_cast<void*>(string)) - 1;
+
+ // deallocate
+ size_t page_offset = offsetof(xml_memory_page, data) + header->page_offset;
+ xml_memory_page* page = reinterpret_cast<xml_memory_page*>(static_cast<void*>(reinterpret_cast<char*>(header) - page_offset));
+
+ // if full_size == 0 then this string occupies the whole page
+ size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
+
+ deallocate_memory(header, full_size, page);
+ }
+
+ xml_memory_page* _root;
+ size_t _busy_size;
+ };
+
+ PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
+ {
+ const size_t large_allocation_threshold = xml_memory_page_size / 4;
+
+ xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
+ out_page = page;
+
+ if (!page) return 0;
+
+ if (size <= large_allocation_threshold)
+ {
+ _root->busy_size = _busy_size;
+
+ // insert page at the end of linked list
+ page->prev = _root;
+ _root->next = page;
+ _root = page;
+
+ _busy_size = size;
+ }
+ else
+ {
+ // insert page before the end of linked list, so that it is deleted as soon as possible
+ // the last page is not deleted even if it's empty (see deallocate_memory)
+ assert(_root->prev);
+
+ page->prev = _root->prev;
+ page->next = _root;
+
+ _root->prev->next = page;
+ _root->prev = page;
+ }
+
+ // allocate inside page
+ page->busy_size = size;
+
+ return page->data;
+ }
+PUGI__NS_END
+
+namespace pugi
+{
+ /// A 'name=value' XML attribute structure.
+ struct xml_attribute_struct
+ {
+ /// Default ctor
+ xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast<uintptr_t>(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
+ {
+ }
+
+ uintptr_t header;
+
+ char_t* name; ///< Pointer to attribute name.
+ char_t* value; ///< Pointer to attribute value.
+
+ xml_attribute_struct* prev_attribute_c; ///< Previous attribute (cyclic list)
+ xml_attribute_struct* next_attribute; ///< Next attribute
+ };
+
+ /// An XML document tree node.
+ struct xml_node_struct
+ {
+ /// Default ctor
+ /// \param type - node type
+ xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast<uintptr_t>(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
+ {
+ }
+
+ uintptr_t header;
+
+ xml_node_struct* parent; ///< Pointer to parent
+
+ char_t* name; ///< Pointer to element name.
+ char_t* value; ///< Pointer to any associated string data.
+
+ xml_node_struct* first_child; ///< First child
+
+ xml_node_struct* prev_sibling_c; ///< Left brother (cyclic list)
+ xml_node_struct* next_sibling; ///< Right brother
+
+ xml_attribute_struct* first_attribute; ///< First attribute
+ };
+}
+
+PUGI__NS_BEGIN
+ struct xml_document_struct: public xml_node_struct, public xml_allocator
+ {
+ xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0)
+ {
+ }
+
+ const char_t* buffer;
+ };
+
+ inline xml_allocator& get_allocator(const xml_node_struct* node)
+ {
+ assert(node);
+
+ return *reinterpret_cast<xml_memory_page*>(node->header & xml_memory_page_pointer_mask)->allocator;
+ }
+PUGI__NS_END
+
+// Low-level DOM operations
+PUGI__NS_BEGIN
+ inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
+ {
+ xml_memory_page* page;
+ void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
+
+ return new (memory) xml_attribute_struct(page);
+ }
+
+ inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
+ {
+ xml_memory_page* page;
+ void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
+
+ return new (memory) xml_node_struct(page, type);
+ }
+
+ inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
+ {
+ uintptr_t header = a->header;
+
+ if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name);
+ if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value);
+
+ alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+ }
+
+ inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
+ {
+ uintptr_t header = n->header;
+
+ if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name);
+ if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value);
+
+ for (xml_attribute_struct* attr = n->first_attribute; attr; )
+ {
+ xml_attribute_struct* next = attr->next_attribute;
+
+ destroy_attribute(attr, alloc);
+
+ attr = next;
+ }
+
+ for (xml_node_struct* child = n->first_child; child; )
+ {
+ xml_node_struct* next = child->next_sibling;
+
+ destroy_node(child, alloc);
+
+ child = next;
+ }
+
+ alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+ }
+
+ PUGI__FN_NO_INLINE xml_node_struct* append_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
+ {
+ xml_node_struct* child = allocate_node(alloc, type);
+ if (!child) return 0;
+
+ child->parent = node;
+
+ xml_node_struct* first_child = node->first_child;
+
+ if (first_child)
+ {
+ xml_node_struct* last_child = first_child->prev_sibling_c;
+
+ last_child->next_sibling = child;
+ child->prev_sibling_c = last_child;
+ first_child->prev_sibling_c = child;
+ }
+ else
+ {
+ node->first_child = child;
+ child->prev_sibling_c = child;
+ }
+
+ return child;
+ }
+
+ PUGI__FN_NO_INLINE xml_attribute_struct* append_attribute_ll(xml_node_struct* node, xml_allocator& alloc)
+ {
+ xml_attribute_struct* a = allocate_attribute(alloc);
+ if (!a) return 0;
+
+ xml_attribute_struct* first_attribute = node->first_attribute;
+
+ if (first_attribute)
+ {
+ xml_attribute_struct* last_attribute = first_attribute->prev_attribute_c;
+
+ last_attribute->next_attribute = a;
+ a->prev_attribute_c = last_attribute;
+ first_attribute->prev_attribute_c = a;
+ }
+ else
+ {
+ node->first_attribute = a;
+ a->prev_attribute_c = a;
+ }
+
+ return a;
+ }
+PUGI__NS_END
+
+// Helper classes for code generation
+PUGI__NS_BEGIN
+ struct opt_false
+ {
+ enum { value = 0 };
+ };
+
+ struct opt_true
+ {
+ enum { value = 1 };
+ };
+PUGI__NS_END
+
+// Unicode utilities
+PUGI__NS_BEGIN
+ inline uint16_t endian_swap(uint16_t value)
+ {
+ return static_cast<uint16_t>(((value & 0xff) << 8) | (value >> 8));
+ }
+
+ inline uint32_t endian_swap(uint32_t value)
+ {
+ return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24);
+ }
+
+ struct utf8_counter
+ {
+ typedef size_t value_type;
+
+ static value_type low(value_type result, uint32_t ch)
+ {
+ // U+0000..U+007F
+ if (ch < 0x80) return result + 1;
+ // U+0080..U+07FF
+ else if (ch < 0x800) return result + 2;
+ // U+0800..U+FFFF
+ else return result + 3;
+ }
+
+ static value_type high(value_type result, uint32_t)
+ {
+ // U+10000..U+10FFFF
+ return result + 4;
+ }
+ };
+
+ struct utf8_writer
+ {
+ typedef uint8_t* value_type;
+
+ static value_type low(value_type result, uint32_t ch)
+ {
+ // U+0000..U+007F
+ if (ch < 0x80)
+ {
+ *result = static_cast<uint8_t>(ch);
+ return result + 1;
+ }
+ // U+0080..U+07FF
+ else if (ch < 0x800)
+ {
+ result[0] = static_cast<uint8_t>(0xC0 | (ch >> 6));
+ result[1] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+ return result + 2;
+ }
+ // U+0800..U+FFFF
+ else
+ {
+ result[0] = static_cast<uint8_t>(0xE0 | (ch >> 12));
+ result[1] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+ result[2] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+ return result + 3;
+ }
+ }
+
+ static value_type high(value_type result, uint32_t ch)
+ {
+ // U+10000..U+10FFFF
+ result[0] = static_cast<uint8_t>(0xF0 | (ch >> 18));
+ result[1] = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
+ result[2] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+ result[3] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+ return result + 4;
+ }
+
+ static value_type any(value_type result, uint32_t ch)
+ {
+ return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+ }
+ };
+
+ struct utf16_counter
+ {
+ typedef size_t value_type;
+
+ static value_type low(value_type result, uint32_t)
+ {
+ return result + 1;
+ }
+
+ static value_type high(value_type result, uint32_t)
+ {
+ return result + 2;
+ }
+ };
+
+ struct utf16_writer
+ {
+ typedef uint16_t* value_type;
+
+ static value_type low(value_type result, uint32_t ch)
+ {
+ *result = static_cast<uint16_t>(ch);
+
+ return result + 1;
+ }
+
+ static value_type high(value_type result, uint32_t ch)
+ {
+ uint32_t msh = static_cast<uint32_t>(ch - 0x10000) >> 10;
+ uint32_t lsh = static_cast<uint32_t>(ch - 0x10000) & 0x3ff;
+
+ result[0] = static_cast<uint16_t>(0xD800 + msh);
+ result[1] = static_cast<uint16_t>(0xDC00 + lsh);
+
+ return result + 2;
+ }
+
+ static value_type any(value_type result, uint32_t ch)
+ {
+ return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+ }
+ };
+
+ struct utf32_counter
+ {
+ typedef size_t value_type;
+
+ static value_type low(value_type result, uint32_t)
+ {
+ return result + 1;
+ }
+
+ static value_type high(value_type result, uint32_t)
+ {
+ return result + 1;
+ }
+ };
+
+ struct utf32_writer
+ {
+ typedef uint32_t* value_type;
+
+ static value_type low(value_type result, uint32_t ch)
+ {
+ *result = ch;
+
+ return result + 1;
+ }
+
+ static value_type high(value_type result, uint32_t ch)
+ {
+ *result = ch;
+
+ return result + 1;
+ }
+
+ static value_type any(value_type result, uint32_t ch)
+ {
+ *result = ch;
+
+ return result + 1;
+ }
+ };
+
+ struct latin1_writer
+ {
+ typedef uint8_t* value_type;
+
+ static value_type low(value_type result, uint32_t ch)
+ {
+ *result = static_cast<uint8_t>(ch > 255 ? '?' : ch);
+
+ return result + 1;
+ }
+
+ static value_type high(value_type result, uint32_t ch)
+ {
+ (void)ch;
+
+ *result = '?';
+
+ return result + 1;
+ }
+ };
+
+ template <size_t size> struct wchar_selector;
+
+ template <> struct wchar_selector<2>
+ {
+ typedef uint16_t type;
+ typedef utf16_counter counter;
+ typedef utf16_writer writer;
+ };
+
+ template <> struct wchar_selector<4>
+ {
+ typedef uint32_t type;
+ typedef utf32_counter counter;
+ typedef utf32_writer writer;
+ };
+
+ typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter;
+ typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer;
+
+ template <typename Traits, typename opt_swap = opt_false> struct utf_decoder
+ {
+ static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+ {
+ const uint8_t utf8_byte_mask = 0x3f;
+
+ while (size)
+ {
+ uint8_t lead = *data;
+
+ // 0xxxxxxx -> U+0000..U+007F
+ if (lead < 0x80)
+ {
+ result = Traits::low(result, lead);
+ data += 1;
+ size -= 1;
+
+ // process aligned single-byte (ascii) blocks
+ if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
+ {
+ // round-trip through void* to silence 'cast increases required alignment of target type' warnings
+ while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0)
+ {
+ result = Traits::low(result, data[0]);
+ result = Traits::low(result, data[1]);
+ result = Traits::low(result, data[2]);
+ result = Traits::low(result, data[3]);
+ data += 4;
+ size -= 4;
+ }
+ }
+ }
+ // 110xxxxx -> U+0080..U+07FF
+ else if (static_cast<unsigned int>(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
+ {
+ result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
+ data += 2;
+ size -= 2;
+ }
+ // 1110xxxx -> U+0800-U+FFFF
+ else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
+ {
+ result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
+ data += 3;
+ size -= 3;
+ }
+ // 11110xxx -> U+10000..U+10FFFF
+ else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
+ {
+ result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
+ data += 4;
+ size -= 4;
+ }
+ // 10xxxxxx or 11111xxx -> invalid
+ else
+ {
+ data += 1;
+ size -= 1;
+ }
+ }
+
+ return result;
+ }
+
+ static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
+ {
+ const uint16_t* end = data + size;
+
+ while (data < end)
+ {
+ uint16_t lead = opt_swap::value ? endian_swap(*data) : *data;
+
+ // U+0000..U+D7FF
+ if (lead < 0xD800)
+ {
+ result = Traits::low(result, lead);
+ data += 1;
+ }
+ // U+E000..U+FFFF
+ else if (static_cast<unsigned int>(lead - 0xE000) < 0x2000)
+ {
+ result = Traits::low(result, lead);
+ data += 1;
+ }
+ // surrogate pair lead
+ else if (static_cast<unsigned int>(lead - 0xD800) < 0x400 && data + 1 < end)
+ {
+ uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
+
+ if (static_cast<unsigned int>(next - 0xDC00) < 0x400)
+ {
+ result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
+ data += 2;
+ }
+ else
+ {
+ data += 1;
+ }
+ }
+ else
+ {
+ data += 1;
+ }
+ }
+
+ return result;
+ }
+
+ static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
+ {
+ const uint32_t* end = data + size;
+
+ while (data < end)
+ {
+ uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
+
+ // U+0000..U+FFFF
+ if (lead < 0x10000)
+ {
+ result = Traits::low(result, lead);
+ data += 1;
+ }
+ // U+10000..U+10FFFF
+ else
+ {
+ result = Traits::high(result, lead);
+ data += 1;
+ }
+ }
+
+ return result;
+ }
+
+ static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+ {
+ for (size_t i = 0; i < size; ++i)
+ {
+ result = Traits::low(result, data[i]);
+ }
+
+ return result;
+ }
+
+ static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result)
+ {
+ return decode_utf16_block(data, size, result);
+ }
+
+ static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result)
+ {
+ return decode_utf32_block(data, size, result);
+ }
+
+ static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result)
+ {
+ return decode_wchar_block_impl(reinterpret_cast<const wchar_selector<sizeof(wchar_t)>::type*>(data), size, result);
+ }
+ };
+
+ template <typename T> PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length)
+ {
+ for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]);
+ }
+
+#ifdef PUGIXML_WCHAR_MODE
+ PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length)
+ {
+ for (size_t i = 0; i < length; ++i) result[i] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[i])));
+ }
+#endif
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+ enum chartype_t
+ {
+ ct_parse_pcdata = 1, // \0, &, \r, <
+ ct_parse_attr = 2, // \0, &, \r, ', "
+ ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab
+ ct_space = 8, // \r, \n, space, tab
+ ct_parse_cdata = 16, // \0, ], >, \r
+ ct_parse_comment = 32, // \0, -, >, \r
+ ct_symbol = 64, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
+ ct_start_symbol = 128 // Any symbol > 127, a-z, A-Z, _, :
+ };
+
+ static const unsigned char chartype_table[256] =
+ {
+ 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
+ 8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63
+ 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95
+ 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, // 112-127
+
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128+
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
+ };
+
+ enum chartypex_t
+ {
+ ctx_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
+ ctx_special_attr = 2, // Any symbol >= 0 and < 32 (except \t), &, <, >, "
+ ctx_start_symbol = 4, // Any symbol > 127, a-z, A-Z, _
+ ctx_digit = 8, // 0-9
+ ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
+ };
+
+ static const unsigned char chartypex_table[256] =
+ {
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31
+ 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0, // 32-47
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0, // 48-63
+
+ 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 64-79
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 20, // 80-95
+ 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 96-111
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, // 112-127
+
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 128+
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+ };
+
+#ifdef PUGIXML_WCHAR_MODE
+ #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
+#else
+ #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
+#endif
+
+ #define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table)
+ #define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
+
+ PUGI__FN bool is_little_endian()
+ {
+ unsigned int ui = 1;
+
+ return *reinterpret_cast<unsigned char*>(&ui) == 1;
+ }
+
+ PUGI__FN xml_encoding get_wchar_encoding()
+ {
+ PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
+
+ if (sizeof(wchar_t) == 2)
+ return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+ else
+ return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+ }
+
+ PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3)
+ {
+ // look for BOM in first few bytes
+ if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
+ if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le;
+ if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be;
+ if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le;
+ if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8;
+
+ // look for <, <? or <?xm in various encodings
+ if (d0 == 0 && d1 == 0 && d2 == 0 && d3 == 0x3c) return encoding_utf32_be;
+ if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
+ if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
+ if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
+ if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d) return encoding_utf8;
+
+ // look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
+ if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
+ if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
+
+ // no known BOM detected, assume utf8
+ return encoding_utf8;
+ }
+
+ PUGI__FN xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size)
+ {
+ // replace wchar encoding with utf implementation
+ if (encoding == encoding_wchar) return get_wchar_encoding();
+
+ // replace utf16 encoding with utf16 with specific endianness
+ if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+ // replace utf32 encoding with utf32 with specific endianness
+ if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+ // only do autodetection if no explicit encoding is requested
+ if (encoding != encoding_auto) return encoding;
+
+ // skip encoding autodetection if input buffer is too small
+ if (size < 4) return encoding_utf8;
+
+ // try to guess encoding (based on XML specification, Appendix F.1)
+ const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+ PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3];
+
+ return guess_buffer_encoding(d0, d1, d2, d3);
+ }
+
+ PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+ {
+ if (is_mutable)
+ {
+ out_buffer = static_cast<char_t*>(const_cast<void*>(contents));
+ }
+ else
+ {
+ void* buffer = xml_memory::allocate(size > 0 ? size : 1);
+ if (!buffer) return false;
+
+ memcpy(buffer, contents, size);
+
+ out_buffer = static_cast<char_t*>(buffer);
+ }
+
+ out_length = size / sizeof(char_t);
+
+ return true;
+ }
+
+#ifdef PUGIXML_WCHAR_MODE
+ PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re)
+ {
+ return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) ||
+ (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
+ }
+
+ PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+ {
+ const char_t* data = static_cast<const char_t*>(contents);
+
+ if (is_mutable)
+ {
+ out_buffer = const_cast<char_t*>(data);
+ }
+ else
+ {
+ out_buffer = static_cast<char_t*>(xml_memory::allocate(size > 0 ? size : 1));
+ if (!out_buffer) return false;
+ }
+
+ out_length = size / sizeof(char_t);
+
+ convert_wchar_endian_swap(out_buffer, data, out_length);
+
+ return true;
+ }
+
+ PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
+ {
+ const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+ // first pass: get length in wchar_t units
+ out_length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
+
+ // allocate buffer of suitable length
+ out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
+ if (!out_buffer) return false;
+
+ // second pass: convert utf8 input to wchar_t
+ wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
+ wchar_writer::value_type out_end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, out_begin);
+
+ assert(out_end == out_begin + out_length);
+ (void)!out_end;
+
+ return true;
+ }
+
+ template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+ {
+ const uint16_t* data = static_cast<const uint16_t*>(contents);
+ size_t length = size / sizeof(uint16_t);
+
+ // first pass: get length in wchar_t units
+ out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, length, 0);
+
+ // allocate buffer of suitable length
+ out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
+ if (!out_buffer) return false;
+
+ // second pass: convert utf16 input to wchar_t
+ wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
+ wchar_writer::value_type out_end = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, length, out_begin);
+
+ assert(out_end == out_begin + out_length);
+ (void)!out_end;
+
+ return true;
+ }
+
+ template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+ {
+ const uint32_t* data = static_cast<const uint32_t*>(contents);
+ size_t length = size / sizeof(uint32_t);
+
+ // first pass: get length in wchar_t units
+ out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, length, 0);
+
+ // allocate buffer of suitable length
+ out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
+ if (!out_buffer) return false;
+
+ // second pass: convert utf32 input to wchar_t
+ wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
+ wchar_writer::value_type out_end = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, length, out_begin);
+
+ assert(out_end == out_begin + out_length);
+ (void)!out_end;
+
+ return true;
+ }
+
+ PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
+ {
+ const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+ // get length in wchar_t units
+ out_length = size;
+
+ // allocate buffer of suitable length
+ out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
+ if (!out_buffer) return false;
+
+ // convert latin1 input to wchar_t
+ wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
+ wchar_writer::value_type out_end = utf_decoder<wchar_writer>::decode_latin1_block(data, size, out_begin);
+
+ assert(out_end == out_begin + out_length);
+ (void)!out_end;
+
+ return true;
+ }
+
+ PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+ {
+ // get native encoding
+ xml_encoding wchar_encoding = get_wchar_encoding();
+
+ // fast path: no conversion required
+ if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+ // only endian-swapping is required
+ if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
+
+ // source encoding is utf8
+ if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size);
+
+ // source encoding is utf16
+ if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+ {
+ xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+ return (native_encoding == encoding) ?
+ convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+ convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+ }
+
+ // source encoding is utf32
+ if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+ {
+ xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+ return (native_encoding == encoding) ?
+ convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+ convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+ }
+
+ // source encoding is latin1
+ if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size);
+
+ assert(!"Invalid encoding");
+ return false;
+ }
+#else
+ template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+ {
+ const uint16_t* data = static_cast<const uint16_t*>(contents);
+ size_t length = size / sizeof(uint16_t);
+
+ // first pass: get length in utf8 units
+ out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, length, 0);
+
+ // allocate buffer of suitable length
+ out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
+ if (!out_buffer) return false;
+
+ // second pass: convert utf16 input to utf8
+ uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);
+ uint8_t* out_end = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, length, out_begin);
+
+ assert(out_end == out_begin + out_length);
+ (void)!out_end;
+
+ return true;
+ }
+
+ template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+ {
+ const uint32_t* data = static_cast<const uint32_t*>(contents);
+ size_t length = size / sizeof(uint32_t);
+
+ // first pass: get length in utf8 units
+ out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, length, 0);
+
+ // allocate buffer of suitable length
+ out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
+ if (!out_buffer) return false;
+
+ // second pass: convert utf32 input to utf8
+ uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);
+ uint8_t* out_end = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, length, out_begin);
+
+ assert(out_end == out_begin + out_length);
+ (void)!out_end;
+
+ return true;
+ }
+
+ PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size)
+ {
+ for (size_t i = 0; i < size; ++i)
+ if (data[i] > 127)
+ return i;
+
+ return size;
+ }
+
+ PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+ {
+ const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+ // get size of prefix that does not need utf8 conversion
+ size_t prefix_length = get_latin1_7bit_prefix_length(data, size);
+ assert(prefix_length <= size);
+
+ const uint8_t* postfix = data + prefix_length;
+ size_t postfix_length = size - prefix_length;
+
+ // if no conversion is needed, just return the original buffer
+ if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+ // first pass: get length in utf8 units
+ out_length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0);
+
+ // allocate buffer of suitable length
+ out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
+ if (!out_buffer) return false;
+
+ // second pass: convert latin1 input to utf8
+ memcpy(out_buffer, data, prefix_length);
+
+ uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);
+ uint8_t* out_end = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, out_begin + prefix_length);
+
+ assert(out_end == out_begin + out_length);
+ (void)!out_end;
+
+ return true;
+ }
+
+ PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+ {
+ // fast path: no conversion required
+ if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+ // source encoding is utf16
+ if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+ {
+ xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+ return (native_encoding == encoding) ?
+ convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+ convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+ }
+
+ // source encoding is utf32
+ if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+ {
+ xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+ return (native_encoding == encoding) ?
+ convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+ convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+ }
+
+ // source encoding is latin1
+ if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
+
+ assert(!"Invalid encoding");
+ return false;
+ }
+#endif
+
+ PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
+ {
+ // get length in utf8 characters
+ return utf_decoder<utf8_counter>::decode_wchar_block(str, length, 0);
+ }
+
+ PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
+ {
+ // convert to utf8
+ uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
+ uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(str, length, begin);
+
+ assert(begin + size == end);
+ (void)!end;
+
+ // zero-terminate
+ buffer[size] = 0;
+ }
+
+#ifndef PUGIXML_NO_STL
+ PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
+ {
+ // first pass: get length in utf8 characters
+ size_t size = as_utf8_begin(str, length);
+
+ // allocate resulting string
+ std::string result;
+ result.resize(size);
+
+ // second pass: convert to utf8
+ if (size > 0) as_utf8_end(&result[0], size, str, length);
+
+ return result;
+ }
+
+ PUGI__FN std::basic_string<wchar_t> as_wide_impl(const char* str, size_t size)
+ {
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
+
+ // first pass: get length in wchar_t units
+ size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
+
+ // allocate resulting string
+ std::basic_string<wchar_t> result;
+ result.resize(length);
+
+ // second pass: convert to wchar_t
+ if (length > 0)
+ {
+ wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
+ wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
+
+ assert(begin + length == end);
+ (void)!end;
+ }
+
+ return result;
+ }
+#endif
+
+ inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target)
+ {
+ assert(target);
+ size_t target_length = strlength(target);
+
+ // always reuse document buffer memory if possible
+ if (!allocated) return target_length >= length;
+
+ // reuse heap memory if waste is not too great
+ const size_t reuse_threshold = 32;
+
+ return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
+ }
+
+ PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
+ {
+ size_t source_length = strlength(source);
+
+ if (source_length == 0)
+ {
+ // empty string and null pointer are equivalent, so just deallocate old memory
+ xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+ if (header & header_mask) alloc->deallocate_string(dest);
+
+ // mark the string as not allocated
+ dest = 0;
+ header &= ~header_mask;
+
+ return true;
+ }
+ else if (dest && strcpy_insitu_allow(source_length, header & header_mask, dest))
+ {
+ // we can reuse old buffer, so just copy the new data (including zero terminator)
+ memcpy(dest, source, (source_length + 1) * sizeof(char_t));
+
+ return true;
+ }
+ else
+ {
+ xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+ // allocate new buffer
+ char_t* buf = alloc->allocate_string(source_length + 1);
+ if (!buf) return false;
+
+ // copy the string (including zero terminator)
+ memcpy(buf, source, (source_length + 1) * sizeof(char_t));
+
+ // deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
+ if (header & header_mask) alloc->deallocate_string(dest);
+
+ // the string is now allocated, so set the flag
+ dest = buf;
+ header |= header_mask;
+
+ return true;
+ }
+ }
+
+ struct gap
+ {
+ char_t* end;
+ size_t size;
+
+ gap(): end(0), size(0)
+ {
+ }
+
+ // Push new gap, move s count bytes further (skipping the gap).
+ // Collapse previous gap.
+ void push(char_t*& s, size_t count)
+ {
+ if (end) // there was a gap already; collapse it
+ {
+ // Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
+ assert(s >= end);
+ memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+ }
+
+ s += count; // end of current gap
+
+ // "merge" two gaps
+ end = s;
+ size += count;
+ }
+
+ // Collapse all gaps, return past-the-end pointer
+ char_t* flush(char_t* s)
+ {
+ if (end)
+ {
+ // Move [old_gap_end, current_pos) to [old_gap_start, ...)
+ assert(s >= end);
+ memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+
+ return s - size;
+ }
+ else return s;
+ }
+ };
+
+ PUGI__FN char_t* strconv_escape(char_t* s, gap& g)
+ {
+ char_t* stre = s + 1;
+
+ switch (*stre)
+ {
+ case '#': // &#...
+ {
+ unsigned int ucsc = 0;
+
+ if (stre[1] == 'x') // &#x... (hex code)
+ {
+ stre += 2;
+
+ char_t ch = *stre;
+
+ if (ch == ';') return stre;
+
+ for (;;)
+ {
+ if (static_cast<unsigned int>(ch - '0') <= 9)
+ ucsc = 16 * ucsc + (ch - '0');
+ else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5)
+ ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
+ else if (ch == ';')
+ break;
+ else // cancel
+ return stre;
+
+ ch = *++stre;
+ }
+
+ ++stre;
+ }
+ else // &#... (dec code)
+ {
+ char_t ch = *++stre;
+
+ if (ch == ';') return stre;
+
+ for (;;)
+ {
+ if (static_cast<unsigned int>(ch - '0') <= 9)
+ ucsc = 10 * ucsc + (ch - '0');
+ else if (ch == ';')
+ break;
+ else // cancel
+ return stre;
+
+ ch = *++stre;
+ }
+
+ ++stre;
+ }
+
+ #ifdef PUGIXML_WCHAR_MODE
+ s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc));
+ #else
+ s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
+ #endif
+
+ g.push(s, stre - s);
+ return stre;
+ }
+
+ case 'a': // &a
+ {
+ ++stre;
+
+ if (*stre == 'm') // &am
+ {
+ if (*++stre == 'p' && *++stre == ';') // &amp;
+ {
+ *s++ = '&';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ }
+ else if (*stre == 'p') // &ap
+ {
+ if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
+ {
+ *s++ = '\'';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ }
+ break;
+ }
+
+ case 'g': // &g
+ {
+ if (*++stre == 't' && *++stre == ';') // &gt;
+ {
+ *s++ = '>';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ break;
+ }
+
+ case 'l': // &l
+ {
+ if (*++stre == 't' && *++stre == ';') // &lt;
+ {
+ *s++ = '<';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ break;
+ }
+
+ case 'q': // &q
+ {
+ if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
+ {
+ *s++ = '"';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ return stre;
+ }
+
+ // Utility macro for last character handling
+ #define ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))
+
+ PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
+ {
+ gap g;
+
+ while (true)
+ {
+ while (!PUGI__IS_CHARTYPE(*s, ct_parse_comment)) ++s;
+
+ if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+ {
+ *s++ = '\n'; // replace first one with 0x0a
+
+ if (*s == '\n') g.push(s, 1);
+ }
+ else if (s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>')) // comment ends here
+ {
+ *g.flush(s) = 0;
+
+ return s + (s[2] == '>' ? 3 : 2);
+ }
+ else if (*s == 0)
+ {
+ return 0;
+ }
+ else ++s;
+ }
+ }
+
+ PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
+ {
+ gap g;
+
+ while (true)
+ {
+ while (!PUGI__IS_CHARTYPE(*s, ct_parse_cdata)) ++s;
+
+ if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+ {
+ *s++ = '\n'; // replace first one with 0x0a
+
+ if (*s == '\n') g.push(s, 1);
+ }
+ else if (s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>')) // CDATA ends here
+ {
+ *g.flush(s) = 0;
+
+ return s + 1;
+ }
+ else if (*s == 0)
+ {
+ return 0;
+ }
+ else ++s;
+ }
+ }
+
+ typedef char_t* (*strconv_pcdata_t)(char_t*);
+
+ template <typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
+ {
+ static char_t* parse(char_t* s)
+ {
+ gap g;
+
+ while (true)
+ {
+ while (!PUGI__IS_CHARTYPE(*s, ct_parse_pcdata)) ++s;
+
+ if (*s == '<') // PCDATA ends here
+ {
+ *g.flush(s) = 0;
+
+ return s + 1;
+ }
+ else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+ {
+ *s++ = '\n'; // replace first one with 0x0a
+
+ if (*s == '\n') g.push(s, 1);
+ }
+ else if (opt_escape::value && *s == '&')
+ {
+ s = strconv_escape(s, g);
+ }
+ else if (*s == 0)
+ {
+ return s;
+ }
+ else ++s;
+ }
+ }
+ };
+
+ PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
+ {
+ PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20);
+
+ switch ((optmask >> 4) & 3) // get bitmask for flags (eol escapes)
+ {
+ case 0: return strconv_pcdata_impl<opt_false, opt_false>::parse;
+ case 1: return strconv_pcdata_impl<opt_false, opt_true>::parse;
+ case 2: return strconv_pcdata_impl<opt_true, opt_false>::parse;
+ case 3: return strconv_pcdata_impl<opt_true, opt_true>::parse;
+ default: return 0; // should not get here
+ }
+ }
+
+ typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
+
+ template <typename opt_escape> struct strconv_attribute_impl
+ {
+ static char_t* parse_wnorm(char_t* s, char_t end_quote)
+ {
+ gap g;
+
+ // trim leading whitespaces
+ if (PUGI__IS_CHARTYPE(*s, ct_space))
+ {
+ char_t* str = s;
+
+ do ++str;
+ while (PUGI__IS_CHARTYPE(*str, ct_space));
+
+ g.push(s, str - s);
+ }
+
+ while (true)
+ {
+ while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr_ws | ct_space)) ++s;
+
+ if (*s == end_quote)
+ {
+ char_t* str = g.flush(s);
+
+ do *str-- = 0;
+ while (PUGI__IS_CHARTYPE(*str, ct_space));
+
+ return s + 1;
+ }
+ else if (PUGI__IS_CHARTYPE(*s, ct_space))
+ {
+ *s++ = ' ';
+
+ if (PUGI__IS_CHARTYPE(*s, ct_space))
+ {
+ char_t* str = s + 1;
+ while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;
+
+ g.push(s, str - s);
+ }
+ }
+ else if (opt_escape::value && *s == '&')
+ {
+ s = strconv_escape(s, g);
+ }
+ else if (!*s)
+ {
+ return 0;
+ }
+ else ++s;
+ }
+ }
+
+ static char_t* parse_wconv(char_t* s, char_t end_quote)
+ {
+ gap g;
+
+ while (true)
+ {
+ while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr_ws)) ++s;
+
+ if (*s == end_quote)
+ {
+ *g.flush(s) = 0;
+
+ return s + 1;
+ }
+ else if (PUGI__IS_CHARTYPE(*s, ct_space))
+ {
+ if (*s == '\r')
+ {
+ *s++ = ' ';
+
+ if (*s == '\n') g.push(s, 1);
+ }
+ else *s++ = ' ';
+ }
+ else if (opt_escape::value && *s == '&')
+ {
+ s = strconv_escape(s, g);
+ }
+ else if (!*s)
+ {
+ return 0;
+ }
+ else ++s;
+ }
+ }
+
+ static char_t* parse_eol(char_t* s, char_t end_quote)
+ {
+ gap g;
+
+ while (true)
+ {
+ while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr)) ++s;
+
+ if (*s == end_quote)
+ {
+ *g.flush(s) = 0;
+
+ return s + 1;
+ }
+ else if (*s == '\r')
+ {
+ *s++ = '\n';
+
+ if (*s == '\n') g.push(s, 1);
+ }
+ else if (opt_escape::value && *s == '&')
+ {
+ s = strconv_escape(s, g);
+ }
+ else if (!*s)
+ {
+ return 0;
+ }
+ else ++s;
+ }
+ }
+
+ static char_t* parse_simple(char_t* s, char_t end_quote)
+ {
+ gap g;
+
+ while (true)
+ {
+ while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr)) ++s;
+
+ if (*s == end_quote)
+ {
+ *g.flush(s) = 0;
+
+ return s + 1;
+ }
+ else if (opt_escape::value && *s == '&')
+ {
+ s = strconv_escape(s, g);
+ }
+ else if (!*s)
+ {
+ return 0;
+ }
+ else ++s;
+ }
+ }
+ };
+
+ PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
+ {
+ PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
+
+ switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
+ {
+ case 0: return strconv_attribute_impl<opt_false>::parse_simple;
+ case 1: return strconv_attribute_impl<opt_true>::parse_simple;
+ case 2: return strconv_attribute_impl<opt_false>::parse_eol;
+ case 3: return strconv_attribute_impl<opt_true>::parse_eol;
+ case 4: return strconv_attribute_impl<opt_false>::parse_wconv;
+ case 5: return strconv_attribute_impl<opt_true>::parse_wconv;
+ case 6: return strconv_attribute_impl<opt_false>::parse_wconv;
+ case 7: return strconv_attribute_impl<opt_true>::parse_wconv;
+ case 8: return strconv_attribute_impl<opt_false>::parse_wnorm;
+ case 9: return strconv_attribute_impl<opt_true>::parse_wnorm;
+ case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
+ case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
+ case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
+ case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
+ case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
+ case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
+ default: return 0; // should not get here
+ }
+ }
+
+ inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
+ {
+ xml_parse_result result;
+ result.status = status;
+ result.offset = offset;
+
+ return result;
+ }
+
+ struct xml_parser
+ {
+ xml_allocator alloc;
+ char_t* error_offset;
+ xml_parse_status error_status;
+
+ // Parser utilities.
+ #define PUGI__SKIPWS() { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; }
+ #define PUGI__OPTSET(OPT) ( optmsk & (OPT) )
+ #define PUGI__PUSHNODE(TYPE) { cursor = append_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); }
+ #define PUGI__POPNODE() { cursor = cursor->parent; }
+ #define PUGI__SCANFOR(X) { while (*s != 0 && !(X)) ++s; }
+ #define PUGI__SCANWHILE(X) { while ((X)) ++s; }
+ #define PUGI__ENDSEG() { ch = *s; *s = 0; ++s; }
+ #define PUGI__THROW_ERROR(err, m) return error_offset = m, error_status = err, static_cast<char_t*>(0)
+ #define PUGI__CHECK_ERROR(err, m) { if (*s == 0) PUGI__THROW_ERROR(err, m); }
+
+ xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
+ {
+ }
+
+ // DOCTYPE consists of nested sections of the following possible types:
+ // <!-- ... -->, <? ... ?>, "...", '...'
+ // <![...]]>
+ // <!...>
+ // First group can not contain nested groups
+ // Second group can contain nested groups of the same type
+ // Third group can contain all other groups
+ char_t* parse_doctype_primitive(char_t* s)
+ {
+ if (*s == '"' || *s == '\'')
+ {
+ // quoted string
+ char_t ch = *s++;
+ PUGI__SCANFOR(*s == ch);
+ if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+ s++;
+ }
+ else if (s[0] == '<' && s[1] == '?')
+ {
+ // <? ... ?>
+ s += 2;
+ PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
+ if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+ s += 2;
+ }
+ else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
+ {
+ s += 4;
+ PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
+ if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+ s += 4;
+ }
+ else PUGI__THROW_ERROR(status_bad_doctype, s);
+
+ return s;
+ }
+
+ char_t* parse_doctype_ignore(char_t* s)
+ {
+ assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
+ s++;
+
+ while (*s)
+ {
+ if (s[0] == '<' && s[1] == '!' && s[2] == '[')
+ {
+ // nested ignore section
+ s = parse_doctype_ignore(s);
+ if (!s) return s;
+ }
+ else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
+ {
+ // ignore section end
+ s += 3;
+
+ return s;
+ }
+ else s++;
+ }
+
+ PUGI__THROW_ERROR(status_bad_doctype, s);
+ }
+
+ char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
+ {
+ assert(s[0] == '<' && s[1] == '!');
+ s++;
+
+ while (*s)
+ {
+ if (s[0] == '<' && s[1] == '!' && s[2] != '-')
+ {
+ if (s[2] == '[')
+ {
+ // ignore
+ s = parse_doctype_ignore(s);
+ if (!s) return s;
+ }
+ else
+ {
+ // some control group
+ s = parse_doctype_group(s, endch, false);
+ if (!s) return s;
+ }
+ }
+ else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
+ {
+ // unknown tag (forbidden), or some primitive group
+ s = parse_doctype_primitive(s);
+ if (!s) return s;
+ }
+ else if (*s == '>')
+ {
+ s++;
+
+ return s;
+ }
+ else s++;
+ }
+
+ if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
+
+ return s;
+ }
+
+ char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch)
+ {
+ // parse node contents, starting with exclamation mark
+ ++s;
+
+ if (*s == '-') // '<!-...'
+ {
+ ++s;
+
+ if (*s == '-') // '<!--...'
+ {
+ ++s;
+
+ if (PUGI__OPTSET(parse_comments))
+ {
+ PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
+ cursor->value = s; // Save the offset.
+ }
+
+ if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments))
+ {
+ s = strconv_comment(s, endch);
+
+ if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value);
+ }
+ else
+ {
+ // Scan for terminating '-->'.
+ PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>'));
+ PUGI__CHECK_ERROR(status_bad_comment, s);
+
+ if (PUGI__OPTSET(parse_comments))
+ *s = 0; // Zero-terminate this segment at the first terminating '-'.
+
+ s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
+ }
+ }
+ else PUGI__THROW_ERROR(status_bad_comment, s);
+ }
+ else if (*s == '[')
+ {
+ // '<![CDATA[...'
+ if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
+ {
+ ++s;
+
+ if (PUGI__OPTSET(parse_cdata))
+ {
+ PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
+ cursor->value = s; // Save the offset.
+
+ if (PUGI__OPTSET(parse_eol))
+ {
+ s = strconv_cdata(s, endch);
+
+ if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value);
+ }
+ else
+ {
+ // Scan for terminating ']]>'.
+ PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
+ PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+ *s++ = 0; // Zero-terminate this segment.
+ }
+ }
+ else // Flagged for discard, but we still have to scan for the terminator.
+ {
+ // Scan for terminating ']]>'.
+ PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
+ PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+ ++s;
+ }
+
+ s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
+ }
+ else PUGI__THROW_ERROR(status_bad_cdata, s);
+ }
+ else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E'))
+ {
+ s -= 2;
+
+ if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+ char_t* mark = s + 9;
+
+ s = parse_doctype_group(s, endch, true);
+ if (!s) return s;
+
+ if (PUGI__OPTSET(parse_doctype))
+ {
+ while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
+
+ PUGI__PUSHNODE(node_doctype);
+
+ cursor->value = mark;
+
+ assert((s[0] == 0 && endch == '>') || s[-1] == '>');
+ s[*s == 0 ? 0 : -1] = 0;
+
+ PUGI__POPNODE();
+ }
+ }
+ else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s);
+ else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s);
+ else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+
+ return s;
+ }
+
+ // Parse a '<?...' construct starting at the '?': either an XML declaration
+ // (<?xml ... ?>) or a processing instruction (<?target value?>).
+ // ref_cursor is the current tree position (updated in place); optmsk are the
+ // parse_* option flags; endch is the stashed last character of the buffer.
+ // Returns the position after the construct (errors go through PUGI__THROW_ERROR).
+ char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch)
+ {
+ // load into registers
+ xml_node_struct* cursor = ref_cursor;
+ char_t ch = 0;
+
+ // parse node contents, starting with question mark
+ ++s;
+
+ // read PI target
+ char_t* target = s;
+
+ if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s);
+
+ PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
+ PUGI__CHECK_ERROR(status_bad_pi, s);
+
+ // determine node type; stricmp / strcasecmp is not portable
+ // (case-insensitive match of exactly "xml": OR with ' ' lowercases ASCII letters)
+ bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s;
+
+ if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi))
+ {
+ if (declaration)
+ {
+ // disallow non top-level declarations
+ if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);
+
+ PUGI__PUSHNODE(node_declaration);
+ }
+ else
+ {
+ PUGI__PUSHNODE(node_pi);
+ }
+
+ cursor->name = target;
+
+ PUGI__ENDSEG();
+
+ // parse value/attributes
+ if (ch == '?')
+ {
+ // empty node
+ if (!ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
+ s += (*s == '>');
+
+ PUGI__POPNODE();
+ }
+ else if (PUGI__IS_CHARTYPE(ch, ct_space))
+ {
+ PUGI__SKIPWS();
+
+ // scan for tag end
+ char_t* value = s;
+
+ PUGI__SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
+ PUGI__CHECK_ERROR(status_bad_pi, s);
+
+ if (declaration)
+ {
+ // replace ending ? with / so that 'element' terminates properly
+ *s = '/';
+
+ // we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
+ s = value;
+ }
+ else
+ {
+ // store value and step over >
+ cursor->value = value;
+ PUGI__POPNODE();
+
+ PUGI__ENDSEG();
+
+ s += (*s == '>');
+ }
+ }
+ else PUGI__THROW_ERROR(status_bad_pi, s);
+ }
+ else
+ {
+ // PI/declaration parsing disabled by options: skip to the end of the tag
+ // scan for tag end
+ PUGI__SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
+ PUGI__CHECK_ERROR(status_bad_pi, s);
+
+ s += (s[1] == '>' ? 2 : 1);
+ }
+
+ // store from registers
+ ref_cursor = cursor;
+
+ return s;
+ }
+
+ // Main parse loop: walk the zero-terminated buffer 's', building the node
+ // tree under 'xmldoc'.  Implemented as a goto-based state machine (LOC_TAG,
+ // LOC_ATTRIBUTES) to avoid re-dispatching on '<' for the common paths.
+ // endch is the original last character of the buffer (the caller replaced it
+ // with 0), so several branches treat "*s == 0 && endch == X" as "buffer ends in X".
+ char_t* parse(char_t* s, xml_node_struct* xmldoc, unsigned int optmsk, char_t endch)
+ {
+ strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
+ strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
+
+ char_t ch = 0;
+ xml_node_struct* cursor = xmldoc;
+ char_t* mark = s;
+
+ while (*s != 0)
+ {
+ if (*s == '<')
+ {
+ ++s;
+
+ LOC_TAG:
+ if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
+ {
+ PUGI__PUSHNODE(node_element); // Append a new node to the tree.
+
+ cursor->name = s;
+
+ PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
+ PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+ if (ch == '>')
+ {
+ // end of tag
+ }
+ else if (PUGI__IS_CHARTYPE(ch, ct_space))
+ {
+ LOC_ATTRIBUTES:
+ // attribute loop: also entered from parse_question when a
+ // declaration's pseudo-attributes need parsing
+ while (true)
+ {
+ PUGI__SKIPWS(); // Eat any whitespace.
+
+ if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
+ {
+ xml_attribute_struct* a = append_attribute_ll(cursor, alloc); // Make space for this attribute.
+ if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
+
+ a->name = s; // Save the offset.
+
+ PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
+ PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+ PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+ PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+ if (PUGI__IS_CHARTYPE(ch, ct_space))
+ {
+ PUGI__SKIPWS(); // Eat any whitespace.
+ PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+ ch = *s;
+ ++s;
+ }
+
+ if (ch == '=') // '<... #=...'
+ {
+ PUGI__SKIPWS(); // Eat any whitespace.
+
+ if (*s == '"' || *s == '\'') // '<... #="...'
+ {
+ ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
+ ++s; // Step over the quote.
+ a->value = s; // Save the offset.
+
+ s = strconv_attribute(s, ch);
+
+ if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);
+
+ // After this line the loop continues from the start;
+ // Whitespaces, / and > are ok, symbols and EOF are wrong,
+ // everything else will be detected
+ if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
+ }
+ else PUGI__THROW_ERROR(status_bad_attribute, s);
+ }
+ else PUGI__THROW_ERROR(status_bad_attribute, s);
+ }
+ else if (*s == '/')
+ {
+ ++s;
+
+ if (*s == '>')
+ {
+ // self-closing tag: node is complete, return to parent
+ PUGI__POPNODE();
+ s++;
+ break;
+ }
+ else if (*s == 0 && endch == '>')
+ {
+ // buffer ended right at '/>' (the '>' was stashed in endch)
+ PUGI__POPNODE();
+ break;
+ }
+ else PUGI__THROW_ERROR(status_bad_start_element, s);
+ }
+ else if (*s == '>')
+ {
+ ++s;
+
+ break;
+ }
+ else if (*s == 0 && endch == '>')
+ {
+ break;
+ }
+ else PUGI__THROW_ERROR(status_bad_start_element, s);
+ }
+
+ // !!!
+ }
+ else if (ch == '/') // '<#.../'
+ {
+ if (!ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);
+
+ PUGI__POPNODE(); // Pop.
+
+ s += (*s == '>');
+ }
+ else if (ch == 0)
+ {
+ // we stepped over null terminator, backtrack & handle closing tag
+ --s;
+
+ if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
+ }
+ else PUGI__THROW_ERROR(status_bad_start_element, s);
+ }
+ else if (*s == '/')
+ {
+ // closing tag '</name>': verify it matches the open element's name
+ ++s;
+
+ char_t* name = cursor->name;
+ if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+
+ while (PUGI__IS_CHARTYPE(*s, ct_symbol))
+ {
+ if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+ }
+
+ if (*name)
+ {
+ // name in tree is longer; allow the case where only endch is missing
+ if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
+ else PUGI__THROW_ERROR(status_end_element_mismatch, s);
+ }
+
+ PUGI__POPNODE(); // Pop.
+
+ PUGI__SKIPWS();
+
+ if (*s == 0)
+ {
+ if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+ }
+ else
+ {
+ if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+ ++s;
+ }
+ }
+ else if (*s == '?') // '<?...'
+ {
+ s = parse_question(s, cursor, optmsk, endch);
+ if (!s) return s;
+
+ assert(cursor);
+ // parse_question leaves cursor at a node_declaration to request
+ // pseudo-attribute parsing (version/encoding/standalone)
+ if ((cursor->header & xml_memory_page_type_mask) + 1 == node_declaration) goto LOC_ATTRIBUTES;
+ }
+ else if (*s == '!') // '<!...'
+ {
+ s = parse_exclamation(s, cursor, optmsk, endch);
+ if (!s) return s;
+ }
+ else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s);
+ else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+ }
+ else
+ {
+ // PCDATA branch: text between tags
+ mark = s; // Save this offset while searching for a terminator.
+
+ PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
+
+ if (*s == '<')
+ {
+ // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
+ assert(mark != s);
+
+ if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single))
+ {
+ continue;
+ }
+ else if (PUGI__OPTSET(parse_ws_pcdata_single))
+ {
+ // keep whitespace-only pcdata only if it would be the sole child
+ if (s[1] != '/' || cursor->first_child) continue;
+ }
+ }
+
+ s = mark;
+
+ if (cursor->parent)
+ {
+ PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
+ cursor->value = s; // Save the offset.
+
+ s = strconv_pcdata(s);
+
+ PUGI__POPNODE(); // Pop since this is a standalone.
+
+ if (!*s) break;
+ }
+ else
+ {
+ // top-level text is discarded: skip to the next tag
+ PUGI__SCANFOR(*s == '<'); // '...<'
+ if (!*s) break;
+
+ ++s;
+ }
+
+ // We're after '<'
+ goto LOC_TAG;
+ }
+ }
+
+ // check that last tag is closed
+ if (cursor != xmldoc) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+
+ return s;
+ }
+
+ // Entry point used by the document loaders: sets up the parser, runs it over
+ // 'buffer' (modified in place), and translates the internal error state into
+ // an xml_parse_result.  The last character of the buffer is stashed in
+ // 'endch' and overwritten with 0 so the inner loop needs no length checks.
+ static xml_parse_result parse(char_t* buffer, size_t length, xml_node_struct* root, unsigned int optmsk)
+ {
+ xml_document_struct* xmldoc = static_cast<xml_document_struct*>(root);
+
+ // store buffer for offset_debug
+ xmldoc->buffer = buffer;
+
+ // early-out for empty documents
+ if (length == 0) return make_parse_result(status_ok);
+
+ // create parser on stack
+ xml_parser parser(*xmldoc);
+
+ // save last character and make buffer zero-terminated (speeds up parsing)
+ char_t endch = buffer[length - 1];
+ buffer[length - 1] = 0;
+
+ // perform actual parsing
+ parser.parse(buffer, xmldoc, optmsk, endch);
+
+ xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
+ assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
+
+ // update allocator state
+ *static_cast<xml_allocator*>(xmldoc) = parser.alloc;
+
+ // since we removed last character, we have to handle the only possible false positive
+ if (result && endch == '<')
+ {
+ // there's no possible well-formed document with < at the end
+ return make_parse_result(status_unrecognized_tag, length);
+ }
+
+ return result;
+ }
+ };
+
+ // Output facilities
+ // Encoding that matches char_t in memory: the wchar encoding in wchar mode,
+ // plain UTF-8 otherwise.  Output that already matches this needs no conversion.
+ PUGI__FN xml_encoding get_write_native_encoding()
+ {
+ #ifdef PUGIXML_WCHAR_MODE
+ return get_wchar_encoding();
+ #else
+ return encoding_utf8;
+ #endif
+ }
+
+ // Resolve a user-requested output encoding to a concrete one:
+ // wchar -> the platform wchar encoding, utf16/utf32 -> the variant with the
+ // host's endianness, auto -> utf8; anything else is already concrete.
+ PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
+ {
+     switch (encoding)
+     {
+     case encoding_wchar:
+         // replace wchar encoding with utf implementation
+         return get_wchar_encoding();
+
+     case encoding_utf16:
+         // pick the endianness-specific utf16 variant
+         return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+     case encoding_utf32:
+         // pick the endianness-specific utf32 variant
+         return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+     case encoding_auto:
+         // no explicit encoding requested: assume utf8
+         return encoding_utf8;
+
+     default:
+         return encoding;
+     }
+ }
+
+#ifdef PUGIXML_WCHAR_MODE
+ // Largest prefix of 'data' that ends on a complete codepoint (wchar mode).
+ // With 2-byte wchar_t a trailing high surrogate (0xD800..0xDBFF) must be
+ // deferred to the next chunk; 4-byte wchar_t is always complete.
+ PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+ {
+ assert(length > 0);
+
+ // discard last character if it's the lead of a surrogate pair
+ return (sizeof(wchar_t) == 2 && static_cast<unsigned int>(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
+ }
+
+ // Convert a wchar buffer 'data' of 'length' characters into 'encoding',
+ // writing into whichever of the r_* scratch pointers matches the target
+ // width (they alias one scratch union in the caller).  Returns the number
+ // of BYTES produced.
+ PUGI__FN size_t convert_buffer(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+ {
+ // only endian-swapping is required
+ if (need_endian_swap_utf(encoding, get_wchar_encoding()))
+ {
+ convert_wchar_endian_swap(r_char, data, length);
+
+ return length * sizeof(char_t);
+ }
+
+ // convert to utf8
+ if (encoding == encoding_utf8)
+ {
+ uint8_t* dest = r_u8;
+ uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(data, length, dest);
+
+ return static_cast<size_t>(end - dest);
+ }
+
+ // convert to utf16
+ if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+ {
+ uint16_t* dest = r_u16;
+
+ // convert to native utf16
+ uint16_t* end = utf_decoder<utf16_writer>::decode_wchar_block(data, length, dest);
+
+ // swap if necessary
+ xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+ // in-place swap is safe: source and destination are the same buffer
+ if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+ return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+ }
+
+ // convert to utf32
+ if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+ {
+ uint32_t* dest = r_u32;
+
+ // convert to native utf32
+ uint32_t* end = utf_decoder<utf32_writer>::decode_wchar_block(data, length, dest);
+
+ // swap if necessary
+ xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+ if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+ return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+ }
+
+ // convert to latin1
+ if (encoding == encoding_latin1)
+ {
+ uint8_t* dest = r_u8;
+ uint8_t* end = utf_decoder<latin1_writer>::decode_wchar_block(data, length, dest);
+
+ return static_cast<size_t>(end - dest);
+ }
+
+ assert(!"Invalid encoding");
+ return 0;
+ }
+#else
+ // Largest prefix of a UTF-8 buffer that ends on a complete codepoint.
+ // Scans backwards over up to 4 trailing continuation bytes (10xxxxxx)
+ // looking for a lead/standalone byte to cut at.
+ PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+ {
+ assert(length > 4);
+
+ for (size_t i = 1; i <= 4; ++i)
+ {
+ uint8_t ch = static_cast<uint8_t>(data[length - i]);
+
+ // either a standalone character or a leading one
+ if ((ch & 0xc0) != 0x80) return length - i;
+ }
+
+ // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
+ return length;
+ }
+
+ // Convert a UTF-8 buffer 'data' of 'length' bytes into 'encoding', writing
+ // into the matching r_* scratch pointer.  Returns the number of BYTES
+ // produced.  utf8/latin1-to-utf8 never reaches here (fast path in caller),
+ // hence only utf16/utf32/latin1 targets are handled.
+ PUGI__FN size_t convert_buffer(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+ {
+ if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+ {
+ uint16_t* dest = r_u16;
+
+ // convert to native utf16
+ uint16_t* end = utf_decoder<utf16_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+ // swap if necessary
+ xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+ if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+ return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+ }
+
+ if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+ {
+ uint32_t* dest = r_u32;
+
+ // convert to native utf32
+ uint32_t* end = utf_decoder<utf32_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+ // swap if necessary
+ xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+ if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+ return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+ }
+
+ if (encoding == encoding_latin1)
+ {
+ uint8_t* dest = r_u8;
+ uint8_t* end = utf_decoder<latin1_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+ return static_cast<size_t>(end - dest);
+ }
+
+ assert(!"Invalid encoding");
+ return 0;
+ }
+#endif
+
+ // Buffers output in char_t units and converts to the target encoding on
+ // flush.  'buffer' holds pending characters; 'scratch' is sized for the
+ // worst-case expansion of a full buffer (see bufcapacity comment below).
+ // Non-copyable: holds a reference to the underlying xml_writer.
+ class xml_buffered_writer
+ {
+ xml_buffered_writer(const xml_buffered_writer&);
+ xml_buffered_writer& operator=(const xml_buffered_writer&);
+
+ public:
+ xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding))
+ {
+ // the fixed-arity write() overloads below assume room for up to 8 chars
+ PUGI__STATIC_ASSERT(bufcapacity >= 8);
+ }
+
+ ~xml_buffered_writer()
+ {
+ flush();
+ }
+
+ void flush()
+ {
+ flush(buffer, bufsize);
+ bufsize = 0;
+ }
+
+ // Convert 'data' (if needed) and hand it to the underlying writer.
+ void flush(const char_t* data, size_t size)
+ {
+ if (size == 0) return;
+
+ // fast path, just write data
+ if (encoding == get_write_native_encoding())
+ writer.write(data, size * sizeof(char_t));
+ else
+ {
+ // convert chunk
+ size_t result = convert_buffer(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding);
+ assert(result <= sizeof(scratch));
+
+ // write data
+ writer.write(scratch.data_u8, result);
+ }
+ }
+
+ void write(const char_t* data, size_t length)
+ {
+ if (bufsize + length > bufcapacity)
+ {
+ // flush the remaining buffer contents
+ flush();
+
+ // handle large chunks
+ if (length > bufcapacity)
+ {
+ if (encoding == get_write_native_encoding())
+ {
+ // fast path, can just write data chunk
+ writer.write(data, length * sizeof(char_t));
+ return;
+ }
+
+ // need to convert in suitable chunks
+ while (length > bufcapacity)
+ {
+ // get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer
+ // and form a complete codepoint sequence (i.e. discard start of last codepoint if necessary)
+ size_t chunk_size = get_valid_length(data, bufcapacity);
+
+ // convert chunk and write
+ flush(data, chunk_size);
+
+ // iterate
+ data += chunk_size;
+ length -= chunk_size;
+ }
+
+ // small tail is copied below
+ bufsize = 0;
+ }
+ }
+
+ memcpy(buffer + bufsize, data, length * sizeof(char_t));
+ bufsize += length;
+ }
+
+ void write(const char_t* data)
+ {
+ write(data, strlength(data));
+ }
+
+ // Fixed-arity overloads: append 1..6 characters, flushing first if they
+ // would not fit (bufcapacity >= 8 guarantees they fit after a flush).
+ void write(char_t d0)
+ {
+ if (bufsize + 1 > bufcapacity) flush();
+
+ buffer[bufsize + 0] = d0;
+ bufsize += 1;
+ }
+
+ void write(char_t d0, char_t d1)
+ {
+ if (bufsize + 2 > bufcapacity) flush();
+
+ buffer[bufsize + 0] = d0;
+ buffer[bufsize + 1] = d1;
+ bufsize += 2;
+ }
+
+ void write(char_t d0, char_t d1, char_t d2)
+ {
+ if (bufsize + 3 > bufcapacity) flush();
+
+ buffer[bufsize + 0] = d0;
+ buffer[bufsize + 1] = d1;
+ buffer[bufsize + 2] = d2;
+ bufsize += 3;
+ }
+
+ void write(char_t d0, char_t d1, char_t d2, char_t d3)
+ {
+ if (bufsize + 4 > bufcapacity) flush();
+
+ buffer[bufsize + 0] = d0;
+ buffer[bufsize + 1] = d1;
+ buffer[bufsize + 2] = d2;
+ buffer[bufsize + 3] = d3;
+ bufsize += 4;
+ }
+
+ void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4)
+ {
+ if (bufsize + 5 > bufcapacity) flush();
+
+ buffer[bufsize + 0] = d0;
+ buffer[bufsize + 1] = d1;
+ buffer[bufsize + 2] = d2;
+ buffer[bufsize + 3] = d3;
+ buffer[bufsize + 4] = d4;
+ bufsize += 5;
+ }
+
+ void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5)
+ {
+ if (bufsize + 6 > bufcapacity) flush();
+
+ buffer[bufsize + 0] = d0;
+ buffer[bufsize + 1] = d1;
+ buffer[bufsize + 2] = d2;
+ buffer[bufsize + 3] = d3;
+ buffer[bufsize + 4] = d4;
+ buffer[bufsize + 5] = d5;
+ bufsize += 6;
+ }
+
+ // utf8 maximum expansion: x4 (-> utf32)
+ // utf16 maximum expansion: x2 (-> utf32)
+ // utf32 maximum expansion: x1
+ enum
+ {
+ bufcapacitybytes =
+ #ifdef PUGIXML_MEMORY_OUTPUT_STACK
+ PUGIXML_MEMORY_OUTPUT_STACK
+ #else
+ 10240
+ #endif
+ ,
+ // sizeof(char_t) for 'buffer' plus 4 bytes/char worst case for 'scratch'
+ bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4)
+ };
+
+ char_t buffer[bufcapacity];
+
+ union
+ {
+ uint8_t data_u8[4 * bufcapacity];
+ uint16_t data_u16[2 * bufcapacity];
+ uint32_t data_u32[bufcapacity];
+ char_t data_char[bufcapacity];
+ } scratch;
+
+ xml_writer& writer;
+ size_t bufsize;
+ xml_encoding encoding;
+ };
+
+ // Write 's' to 'writer', escaping the characters flagged in the chartypex
+ // table selected by 'type' (&, <, >, " and control chars).  Runs of
+ // unflagged characters are emitted in bulk.
+ PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type)
+ {
+ while (*s)
+ {
+ const char_t* prev = s;
+
+ // While *s is a usual symbol
+ while (!PUGI__IS_CHARTYPEX(*s, type)) ++s;
+
+ writer.write(prev, static_cast<size_t>(s - prev));
+
+ switch (*s)
+ {
+ case 0: break;
+ case '&':
+ writer.write('&', 'a', 'm', 'p', ';');
+ ++s;
+ break;
+ case '<':
+ writer.write('&', 'l', 't', ';');
+ ++s;
+ break;
+ case '>':
+ writer.write('&', 'g', 't', ';');
+ ++s;
+ break;
+ case '"':
+ writer.write('&', 'q', 'u', 'o', 't', ';');
+ ++s;
+ break;
+ default: // s is not a usual symbol
+ {
+ // only control characters (< 32) remain flagged by the tables,
+ // so a two-digit decimal character reference always suffices
+ unsigned int ch = static_cast<unsigned int>(*s++);
+ assert(ch < 32);
+
+ writer.write('&', '#', static_cast<char_t>((ch / 10) + '0'), static_cast<char_t>((ch % 10) + '0'), ';');
+ }
+ }
+ }
+ }
+
+ // Emit 's': verbatim when format_no_escapes is set, otherwise escaped
+ // according to the chartypex class 'type'.
+ PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags)
+ {
+     const bool raw = (flags & format_no_escapes) != 0;
+
+     if (raw)
+         writer.write(s);
+     else
+         text_output_escaped(writer, s, type);
+ }
+
+ // Write 's' as one or more CDATA sections.  A literal "]]>" inside the text
+ // would terminate the section early, so the text is split after the "]]"
+ // and the '>' starts the next section.
+ PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s)
+ {
+ do
+ {
+ writer.write('<', '!', '[', 'C', 'D');
+ writer.write('A', 'T', 'A', '[');
+
+ const char_t* prev = s;
+
+ // look for ]]> sequence - we can't output it as is since it terminates CDATA
+ while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s;
+
+ // skip ]] if we stopped at ]]>, > will go to the next CDATA section
+ if (*s) s += 2;
+
+ writer.write(prev, static_cast<size_t>(s - prev));
+
+ writer.write(']', ']', '>');
+ }
+ while (*s);
+ }
+
+ // Write every attribute of 'node' as ' name="value"', escaping the value.
+ // Nameless attributes are emitted with the ":anonymous" placeholder.
+ PUGI__FN void node_output_attributes(xml_buffered_writer& writer, const xml_node& node, unsigned int flags)
+ {
+     const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+     for (xml_attribute a = node.first_attribute(); a; a = a.next_attribute())
+     {
+         const char_t* name = a.name();
+
+         writer.write(' ');
+         writer.write(name[0] ? name : default_name);
+         writer.write('=', '"');
+
+         text_output(writer, a.value(), ctx_special_attr, flags);
+
+         writer.write('"');
+     }
+ }
+
+ // Serialize 'node' (recursively) to 'writer'.  'indent' is repeated 'depth'
+ // times before the node when format_indent is on and format_raw is off;
+ // format_raw suppresses all newlines.  Nameless elements/PIs are emitted
+ // with the ":anonymous" placeholder.
+ PUGI__FN void node_output(xml_buffered_writer& writer, const xml_node& node, const char_t* indent, unsigned int flags, unsigned int depth)
+ {
+ const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+ if ((flags & format_indent) != 0 && (flags & format_raw) == 0)
+ for (unsigned int i = 0; i < depth; ++i) writer.write(indent);
+
+ switch (node.type())
+ {
+ case node_document:
+ {
+ // document itself produces no markup; output children at same depth
+ for (xml_node n = node.first_child(); n; n = n.next_sibling())
+ node_output(writer, n, indent, flags, depth);
+ break;
+ }
+
+ case node_element:
+ {
+ const char_t* name = node.name()[0] ? node.name() : default_name;
+
+ writer.write('<');
+ writer.write(name);
+
+ node_output_attributes(writer, node, flags);
+
+ if (flags & format_raw)
+ {
+ if (!node.first_child())
+ writer.write(' ', '/', '>');
+ else
+ {
+ writer.write('>');
+
+ for (xml_node n = node.first_child(); n; n = n.next_sibling())
+ node_output(writer, n, indent, flags, depth + 1);
+
+ writer.write('<', '/');
+ writer.write(name);
+ writer.write('>');
+ }
+ }
+ else if (!node.first_child())
+ writer.write(' ', '/', '>', '\n');
+ else if (node.first_child() == node.last_child() && (node.first_child().type() == node_pcdata || node.first_child().type() == node_cdata))
+ {
+ // single text/cdata child: keep it on the same line as the tags
+ writer.write('>');
+
+ if (node.first_child().type() == node_pcdata)
+ text_output(writer, node.first_child().value(), ctx_special_pcdata, flags);
+ else
+ text_output_cdata(writer, node.first_child().value());
+
+ writer.write('<', '/');
+ writer.write(name);
+ writer.write('>', '\n');
+ }
+ else
+ {
+ writer.write('>', '\n');
+
+ for (xml_node n = node.first_child(); n; n = n.next_sibling())
+ node_output(writer, n, indent, flags, depth + 1);
+
+ if ((flags & format_indent) != 0 && (flags & format_raw) == 0)
+ for (unsigned int i = 0; i < depth; ++i) writer.write(indent);
+
+ writer.write('<', '/');
+ writer.write(name);
+ writer.write('>', '\n');
+ }
+
+ break;
+ }
+
+ case node_pcdata:
+ text_output(writer, node.value(), ctx_special_pcdata, flags);
+ if ((flags & format_raw) == 0) writer.write('\n');
+ break;
+
+ case node_cdata:
+ text_output_cdata(writer, node.value());
+ if ((flags & format_raw) == 0) writer.write('\n');
+ break;
+
+ case node_comment:
+ writer.write('<', '!', '-', '-');
+ writer.write(node.value());
+ writer.write('-', '-', '>');
+ if ((flags & format_raw) == 0) writer.write('\n');
+ break;
+
+ case node_pi:
+ case node_declaration:
+ writer.write('<', '?');
+ writer.write(node.name()[0] ? node.name() : default_name);
+
+ if (node.type() == node_declaration)
+ {
+ // declaration carries version/encoding/standalone as attributes
+ node_output_attributes(writer, node, flags);
+ }
+ else if (node.value()[0])
+ {
+ writer.write(' ');
+ writer.write(node.value());
+ }
+
+ writer.write('?', '>');
+ if ((flags & format_raw) == 0) writer.write('\n');
+ break;
+
+ case node_doctype:
+ writer.write('<', '!', 'D', 'O', 'C');
+ writer.write('T', 'Y', 'P', 'E');
+
+ if (node.value()[0])
+ {
+ writer.write(' ');
+ writer.write(node.value());
+ }
+
+ writer.write('>');
+ if ((flags & format_raw) == 0) writer.write('\n');
+ break;
+
+ default:
+ assert(!"Invalid node type");
+ }
+ }
+
+ // True if 'node' has a declaration child that precedes any element child
+ // (used to decide whether a default declaration must be written).
+ inline bool has_declaration(const xml_node& node)
+ {
+     for (xml_node child = node.first_child(); child; child = child.next_sibling())
+     {
+         switch (child.type())
+         {
+         case node_declaration:
+             return true;
+
+         case node_element:
+             // declarations after the first element don't count
+             return false;
+
+         default:
+             break;
+         }
+     }
+
+     return false;
+ }
+
+ // Structural validity check for inserting a 'child' node under 'parent':
+ // only documents and elements may have children, documents/null nodes are
+ // never insertable, and declarations/doctypes are document-level only.
+ inline bool allow_insert_child(xml_node_type parent, xml_node_type child)
+ {
+     const bool parent_can_hold = (parent == node_document || parent == node_element);
+     if (!parent_can_hold) return false;
+
+     if (child == node_document || child == node_null) return false;
+
+     const bool top_level_only = (child == node_declaration || child == node_doctype);
+     if (top_level_only && parent != node_document) return false;
+
+     return true;
+ }
+
+ // Deep-copy 'source' into 'dest' (which must already be of the same type),
+ // omitting the subtree rooted at 'skip' (used when copying a node into one
+ // of its own descendants).
+ PUGI__FN void recursive_copy_skip(xml_node& dest, const xml_node& source, const xml_node& skip)
+ {
+ assert(dest.type() == source.type());
+
+ switch (source.type())
+ {
+ case node_element:
+ {
+ dest.set_name(source.name());
+
+ for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute())
+ dest.append_attribute(a.name()).set_value(a.value());
+
+ for (xml_node c = source.first_child(); c; c = c.next_sibling())
+ {
+ if (c == skip) continue;
+
+ xml_node cc = dest.append_child(c.type());
+ assert(cc);
+
+ recursive_copy_skip(cc, c, skip);
+ }
+
+ break;
+ }
+
+ case node_pcdata:
+ case node_cdata:
+ case node_comment:
+ case node_doctype:
+ // value-only node types
+ dest.set_value(source.value());
+ break;
+
+ case node_pi:
+ dest.set_name(source.name());
+ dest.set_value(source.value());
+ break;
+
+ case node_declaration:
+ {
+ dest.set_name(source.name());
+
+ for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute())
+ dest.append_attribute(a.name()).set_value(a.value());
+
+ break;
+ }
+
+ default:
+ assert(!"Invalid node type");
+ }
+ }
+
+ // True if the raw node is pcdata or cdata.  The node type is stored biased
+ // by one in the low bits of the header word.
+ inline bool is_text_node(xml_node_struct* node)
+ {
+     switch (static_cast<xml_node_type>((node->header & impl::xml_memory_page_type_mask) + 1))
+     {
+     case node_pcdata:
+     case node_cdata:
+         return true;
+
+     default:
+         return false;
+     }
+ }
+
+ // get value with conversion functions
+ // Parse 'value' as a base-10 int; 'def' is returned for a null string.
+ PUGI__FN int get_value_int(const char_t* value, int def)
+ {
+     if (value == 0) return def;
+
+ #ifdef PUGIXML_WCHAR_MODE
+     long parsed = wcstol(value, 0, 10);
+ #else
+     long parsed = strtol(value, 0, 10);
+ #endif
+
+     return static_cast<int>(parsed);
+ }
+
+ // Parse 'value' as a base-10 unsigned int; 'def' is returned for a null string.
+ PUGI__FN unsigned int get_value_uint(const char_t* value, unsigned int def)
+ {
+     if (value == 0) return def;
+
+ #ifdef PUGIXML_WCHAR_MODE
+     unsigned long parsed = wcstoul(value, 0, 10);
+ #else
+     unsigned long parsed = strtoul(value, 0, 10);
+ #endif
+
+     return static_cast<unsigned int>(parsed);
+ }
+
+ // Parse 'value' as a floating-point number; 'def' is returned for a null string.
+ PUGI__FN double get_value_double(const char_t* value, double def)
+ {
+     if (value == 0) return def;
+
+ #ifdef PUGIXML_WCHAR_MODE
+     double parsed = wcstod(value, 0);
+ #else
+     double parsed = strtod(value, 0);
+ #endif
+
+     return parsed;
+ }
+
+ // Parse 'value' as a floating-point number narrowed to float; 'def' is
+ // returned for a null string.
+ PUGI__FN float get_value_float(const char_t* value, float def)
+ {
+     if (value == 0) return def;
+
+ #ifdef PUGIXML_WCHAR_MODE
+     double parsed = wcstod(value, 0);
+ #else
+     double parsed = strtod(value, 0);
+ #endif
+
+     return static_cast<float>(parsed);
+ }
+
+ // Parse 'value' as a boolean; 'def' is returned for a null string.
+ // Only the first character matters: 1/t/T/y/Y mean true, anything else false.
+ PUGI__FN bool get_value_bool(const char_t* value, bool def)
+ {
+     if (value == 0) return def;
+
+     // 1*, t* (true), T* (True), y* (yes), Y* (YES)
+     switch (*value)
+     {
+     case '1':
+     case 't':
+     case 'T':
+     case 'y':
+     case 'Y':
+         return true;
+
+     default:
+         return false;
+     }
+ }
+
+ // set value with conversion functions
+ // Store the ASCII text in 'buf' into 'dest' via strcpy_insitu, widening it
+ // first in wchar mode.  Returns false on allocation failure.
+ PUGI__FN bool set_value_buffer(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char (&buf)[128])
+ {
+ #ifdef PUGIXML_WCHAR_MODE
+ char_t wbuf[128];
+ impl::widen_ascii(wbuf, buf);
+
+ return strcpy_insitu(dest, header, header_mask, wbuf);
+ #else
+ return strcpy_insitu(dest, header, header_mask, buf);
+ #endif
+ }
+
+ // Format an int as decimal text and store it via set_value_buffer.
+ PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, int value)
+ {
+     // 128 chars is ample for any int in decimal
+     char buf[128];
+     sprintf(buf, "%d", value);
+     return set_value_buffer(dest, header, header_mask, buf);
+ }
+
+ // Format an unsigned int as decimal text and store it via set_value_buffer.
+ PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned int value)
+ {
+     // 128 chars is ample for any unsigned int in decimal
+     char buf[128];
+     sprintf(buf, "%u", value);
+     return set_value_buffer(dest, header, header_mask, buf);
+ }
+
+ // Format a double with %g and store it via set_value_buffer.
+ PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, double value)
+ {
+     // %g output is far below 128 chars for any double
+     char buf[128];
+     sprintf(buf, "%g", value);
+     return set_value_buffer(dest, header, header_mask, buf);
+ }
+
+ // Store a bool as the literal text "true"/"false".
+ PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, bool value)
+ {
+ return strcpy_insitu(dest, header, header_mask, value ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+ }
+
+ // we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick
+ // we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick
+ // Returns status_ok with the size in 'out_result', status_io_error when
+ // tell fails, or status_out_of_memory when the size does not fit in size_t.
+ // Uses 64-bit seek/tell where the platform provides it.
+ PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result)
+ {
+ #if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+ // there are 64-bit versions of fseek/ftell, let's use them
+ typedef __int64 length_type;
+
+ _fseeki64(file, 0, SEEK_END);
+ length_type length = _ftelli64(file);
+ _fseeki64(file, 0, SEEK_SET);
+ #elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && !defined(__STRICT_ANSI__)
+ // there are 64-bit versions of fseek/ftell, let's use them
+ typedef off64_t length_type;
+
+ fseeko64(file, 0, SEEK_END);
+ length_type length = ftello64(file);
+ fseeko64(file, 0, SEEK_SET);
+ #else
+ // if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway.
+ typedef long length_type;
+
+ fseek(file, 0, SEEK_END);
+ length_type length = ftell(file);
+ fseek(file, 0, SEEK_SET);
+ #endif
+
+ // check for I/O errors
+ if (length < 0) return status_io_error;
+
+ // check for overflow
+ size_t result = static_cast<size_t>(length);
+
+ if (static_cast<length_type>(result) != length) return status_out_of_memory;
+
+ // finalize
+ out_result = result;
+
+ return status_ok;
+ }
+
+ // Load a document from an already-opened FILE*: size it, read it into one
+ // heap buffer, and hand the buffer to the document (which takes ownership
+ // via load_buffer_inplace_own).  The FILE* is closed on every path.
+ PUGI__FN xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding)
+ {
+ if (!file) return make_parse_result(status_file_not_found);
+
+ // get file size (can result in I/O errors)
+ size_t size = 0;
+ xml_parse_status size_status = get_file_size(file, size);
+
+ if (size_status != status_ok)
+ {
+ fclose(file);
+ return make_parse_result(size_status);
+ }
+
+ // allocate buffer for the whole file
+ // (at least 1 byte so that a zero-size file still gets a valid pointer)
+ char* contents = static_cast<char*>(xml_memory::allocate(size > 0 ? size : 1));
+
+ if (!contents)
+ {
+ fclose(file);
+ return make_parse_result(status_out_of_memory);
+ }
+
+ // read file in memory
+ size_t read_size = fread(contents, 1, size, file);
+ fclose(file);
+
+ if (read_size != size)
+ {
+ xml_memory::deallocate(contents);
+ return make_parse_result(status_io_error);
+ }
+
+ return doc.load_buffer_inplace_own(contents, size, options, encoding);
+ }
+
+#ifndef PUGIXML_NO_STL
+ // Singly-linked chunk of stream data used by the non-seekable stream loader;
+ // each chunk holds one xml_memory_page_size worth of T.  destroy() frees a
+ // whole chain and matches the buffer_holder deleter signature.
+ template <typename T> struct xml_stream_chunk
+ {
+ static xml_stream_chunk* create()
+ {
+ void* memory = xml_memory::allocate(sizeof(xml_stream_chunk));
+
+ // placement-new into the custom allocator's memory
+ return new (memory) xml_stream_chunk();
+ }
+
+ static void destroy(void* ptr)
+ {
+ xml_stream_chunk* chunk = static_cast<xml_stream_chunk*>(ptr);
+
+ // free chunk chain
+ while (chunk)
+ {
+ xml_stream_chunk* next = chunk->next;
+ xml_memory::deallocate(chunk);
+ chunk = next;
+ }
+ }
+
+ xml_stream_chunk(): next(0), size(0)
+ {
+ }
+
+ xml_stream_chunk* next;
+ size_t size;     // bytes actually read into 'data'
+
+ T data[xml_memory_page_size / sizeof(T)];
+ };
+
+ // Read a non-seekable stream to EOF into a chunk list, then coalesce into a
+ // single buffer returned via out_buffer/out_size (caller takes ownership).
+ // The chunk list itself is owned by 'chunks' and freed on every exit path.
+ template <typename T> PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+ {
+ buffer_holder chunks(0, xml_stream_chunk<T>::destroy);
+
+ // read file to a chunk list
+ size_t total = 0;
+ xml_stream_chunk<T>* last = 0;
+
+ while (!stream.eof())
+ {
+ // allocate new chunk
+ xml_stream_chunk<T>* chunk = xml_stream_chunk<T>::create();
+ if (!chunk) return status_out_of_memory;
+
+ // append chunk to list
+ if (last) last = last->next = chunk;
+ else chunks.data = last = chunk;
+
+ // read data to chunk
+ stream.read(chunk->data, static_cast<std::streamsize>(sizeof(chunk->data) / sizeof(T)));
+ chunk->size = static_cast<size_t>(stream.gcount()) * sizeof(T);
+
+ // read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors
+ if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+ // guard against huge files (chunk size is small enough to make this overflow check work)
+ if (total + chunk->size < total) return status_out_of_memory;
+ total += chunk->size;
+ }
+
+ // copy chunk list to a contiguous buffer
+ char* buffer = static_cast<char*>(xml_memory::allocate(total));
+ if (!buffer) return status_out_of_memory;
+
+ char* write = buffer;
+
+ for (xml_stream_chunk<T>* chunk = static_cast<xml_stream_chunk<T>*>(chunks.data); chunk; chunk = chunk->next)
+ {
+ assert(write + chunk->size <= buffer + total);
+ memcpy(write, chunk->data, chunk->size);
+ write += chunk->size;
+ }
+
+ assert(write == buffer + total);
+
+ // return buffer
+ *out_buffer = buffer;
+ *out_size = total;
+
+ return status_ok;
+ }
+
+ // Read a seekable stream in one shot: measure the remaining length with
+ // tellg/seekg, allocate a buffer, and read.  Returns the buffer (ownership
+ // to caller) via out_buffer/out_size; gcount() may be smaller than the
+ // measured length (e.g. line-ending conversion), so the actual count is used.
+ template <typename T> PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+ {
+ // get length of remaining data in stream
+ typename std::basic_istream<T>::pos_type pos = stream.tellg();
+ stream.seekg(0, std::ios::end);
+ std::streamoff length = stream.tellg() - pos;
+ stream.seekg(pos);
+
+ if (stream.fail() || pos < 0) return status_io_error;
+
+ // guard against huge files
+ size_t read_length = static_cast<size_t>(length);
+
+ if (static_cast<std::streamsize>(read_length) != length || length < 0) return status_out_of_memory;
+
+ // read stream data into memory (guard against stream exceptions with buffer holder)
+ buffer_holder buffer(xml_memory::allocate((read_length > 0 ? read_length : 1) * sizeof(T)), xml_memory::deallocate);
+ if (!buffer.data) return status_out_of_memory;
+
+ stream.read(static_cast<T*>(buffer.data), static_cast<std::streamsize>(read_length));
+
+ // read may set failbit | eofbit in case gcount() is less than read_length (i.e. line ending conversion), so check for other I/O errors
+ if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+ // return buffer
+ size_t actual_length = static_cast<size_t>(stream.gcount());
+ assert(actual_length <= read_length);
+
+ *out_buffer = buffer.release();
+ *out_size = actual_length * sizeof(T);
+
+ return status_ok;
+ }
+
+ // Load a document from a std::basic_istream, choosing the fastest available
+ // strategy, and hand the resulting buffer to the document (which takes
+ // ownership via load_buffer_inplace_own).
+ template <typename T> PUGI__FN xml_parse_result load_stream_impl(xml_document& doc, std::basic_istream<T>& stream, unsigned int options, xml_encoding encoding)
+ {
+     void* buffer = 0;
+     size_t size = 0;
+
+     // a seekable stream lets us size the buffer up front (faster, less memory);
+     // tellg() < 0 means the stream can't seek, so fall back to chunked reads
+     xml_parse_status status = status_ok;
+
+     if (stream.tellg() < 0)
+         status = load_stream_data_noseek(stream, &buffer, &size);
+     else
+         status = load_stream_data_seek(stream, &buffer, &size);
+
+     if (status != status_ok) return make_parse_result(status);
+
+     return doc.load_buffer_inplace_own(buffer, size, options, encoding);
+ }
+#endif
+
+#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && !defined(__STRICT_ANSI__))
+ PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+ {
+ return _wfopen(path, mode);
+ }
+#else
+	// Converts a wide-character path to a freshly allocated utf8 string; the
+	// caller releases it with xml_memory::deallocate. Returns 0 on allocation
+	// failure. Conversion is done in two passes: size first, then encode.
+	PUGI__FN char* convert_path_heap(const wchar_t* str)
+	{
+		assert(str);
+
+		// first pass: get length in utf8 characters
+		size_t length = wcslen(str);
+		size_t size = as_utf8_begin(str, length);
+
+		// allocate resulting string (+1 for the NUL terminator)
+		char* result = static_cast<char*>(xml_memory::allocate(size + 1));
+		if (!result) return 0;
+
+		// second pass: convert to utf8
+		as_utf8_end(result, size, str, length);
+
+		return result;
+	}
+
+	// Fallback for platforms without _wfopen: opens a wide path by converting it
+	// to utf8 and using fopen. The mode string is narrowed to ASCII; returns 0 if
+	// the path conversion fails or fopen rejects the utf8 path.
+	PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+	{
+		// there is no standard function to open wide paths, so our best bet is to try utf8 path
+		char* path_utf8 = convert_path_heap(path);
+		if (!path_utf8) return 0;
+
+		// convert mode to ASCII (we mirror _wfopen interface); bound the copy so an
+		// unexpectedly long mode string cannot overflow the fixed-size buffer or its
+		// NUL terminator
+		char mode_ascii[4] = {0};
+		for (size_t i = 0; i < 3 && mode[i]; ++i) mode_ascii[i] = static_cast<char>(mode[i]);
+
+		// try to open the utf8 path
+		FILE* result = fopen(path_utf8, mode_ascii);
+
+		// free dummy buffer
+		xml_memory::deallocate(path_utf8);
+
+		return result;
+	}
+#endif
+
+	// Serializes the document to an already-opened FILE and closes it.
+	// Returns true only if no stdio error was recorded on the stream;
+	// a null FILE (failed open) is reported as failure.
+	PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding)
+	{
+		if (!file) return false;
+
+		xml_writer_file writer(file);
+		doc.save(writer, indent, flags, encoding);
+
+		// capture the error flag before fclose invalidates the stream
+		int result = ferror(file);
+
+		fclose(file);
+
+		return result == 0;
+	}
+PUGI__NS_END
+
+namespace pugi
+{
+ PUGI__FN xml_writer_file::xml_writer_file(void* file_): file(file_)
+ {
+ }
+
+ PUGI__FN void xml_writer_file::write(const void* data, size_t size)
+ {
+ size_t result = fwrite(data, 1, size, static_cast<FILE*>(file));
+ (void)!result; // unfortunately we can't do proper error handling here
+ }
+
+#ifndef PUGIXML_NO_STL
+ PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream): narrow_stream(&stream), wide_stream(0)
+ {
+ }
+
+ PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream): narrow_stream(0), wide_stream(&stream)
+ {
+ }
+
+	// Writes a raw buffer to whichever stream this writer was bound to.
+	// Exactly one of narrow_stream / wide_stream is non-null (set by the
+	// constructors); the wide branch requires size to be a whole number of
+	// wchar_t units since it reinterprets the bytes as wide characters.
+	PUGI__FN void xml_writer_stream::write(const void* data, size_t size)
+	{
+		if (narrow_stream)
+		{
+			assert(!wide_stream);
+			narrow_stream->write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(size));
+		}
+		else
+		{
+			assert(wide_stream);
+			assert(size % sizeof(wchar_t) == 0);
+
+			wide_stream->write(reinterpret_cast<const wchar_t*>(data), static_cast<std::streamsize>(size / sizeof(wchar_t)));
+		}
+	}
+#endif
+
+ PUGI__FN xml_tree_walker::xml_tree_walker(): _depth(0)
+ {
+ }
+
+ PUGI__FN xml_tree_walker::~xml_tree_walker()
+ {
+ }
+
+ PUGI__FN int xml_tree_walker::depth() const
+ {
+ return _depth;
+ }
+
+ PUGI__FN bool xml_tree_walker::begin(xml_node&)
+ {
+ return true;
+ }
+
+ PUGI__FN bool xml_tree_walker::end(xml_node&)
+ {
+ return true;
+ }
+
+ PUGI__FN xml_attribute::xml_attribute(): _attr(0)
+ {
+ }
+
+ PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr)
+ {
+ }
+
+ PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***)
+ {
+ }
+
+ PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const
+ {
+ return _attr ? unspecified_bool_xml_attribute : 0;
+ }
+
+ PUGI__FN bool xml_attribute::operator!() const
+ {
+ return !_attr;
+ }
+
+ PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const
+ {
+ return (_attr == r._attr);
+ }
+
+ PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const
+ {
+ return (_attr != r._attr);
+ }
+
+ PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const
+ {
+ return (_attr < r._attr);
+ }
+
+ PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const
+ {
+ return (_attr > r._attr);
+ }
+
+ PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const
+ {
+ return (_attr <= r._attr);
+ }
+
+ PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const
+ {
+ return (_attr >= r._attr);
+ }
+
+ PUGI__FN xml_attribute xml_attribute::next_attribute() const
+ {
+ return _attr ? xml_attribute(_attr->next_attribute) : xml_attribute();
+ }
+
+ PUGI__FN xml_attribute xml_attribute::previous_attribute() const
+ {
+ return _attr && _attr->prev_attribute_c->next_attribute ? xml_attribute(_attr->prev_attribute_c) : xml_attribute();
+ }
+
+ PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const
+ {
+ return (_attr && _attr->value) ? _attr->value : def;
+ }
+
+ PUGI__FN int xml_attribute::as_int(int def) const
+ {
+ return impl::get_value_int(_attr ? _attr->value : 0, def);
+ }
+
+ PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const
+ {
+ return impl::get_value_uint(_attr ? _attr->value : 0, def);
+ }
+
+ PUGI__FN double xml_attribute::as_double(double def) const
+ {
+ return impl::get_value_double(_attr ? _attr->value : 0, def);
+ }
+
+ PUGI__FN float xml_attribute::as_float(float def) const
+ {
+ return impl::get_value_float(_attr ? _attr->value : 0, def);
+ }
+
+ PUGI__FN bool xml_attribute::as_bool(bool def) const
+ {
+ return impl::get_value_bool(_attr ? _attr->value : 0, def);
+ }
+
+ PUGI__FN bool xml_attribute::empty() const
+ {
+ return !_attr;
+ }
+
+ PUGI__FN const char_t* xml_attribute::name() const
+ {
+ return (_attr && _attr->name) ? _attr->name : PUGIXML_TEXT("");
+ }
+
+ PUGI__FN const char_t* xml_attribute::value() const
+ {
+ return (_attr && _attr->value) ? _attr->value : PUGIXML_TEXT("");
+ }
+
+ PUGI__FN size_t xml_attribute::hash_value() const
+ {
+ return static_cast<size_t>(reinterpret_cast<uintptr_t>(_attr) / sizeof(xml_attribute_struct));
+ }
+
+ PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const
+ {
+ return _attr;
+ }
+
+ PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs)
+ {
+ set_value(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_attribute& xml_attribute::operator=(int rhs)
+ {
+ set_value(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs)
+ {
+ set_value(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_attribute& xml_attribute::operator=(double rhs)
+ {
+ set_value(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs)
+ {
+ set_value(rhs);
+ return *this;
+ }
+
+ PUGI__FN bool xml_attribute::set_name(const char_t* rhs)
+ {
+ if (!_attr) return false;
+
+ return impl::strcpy_insitu(_attr->name, _attr->header, impl::xml_memory_page_name_allocated_mask, rhs);
+ }
+
+ PUGI__FN bool xml_attribute::set_value(const char_t* rhs)
+ {
+ if (!_attr) return false;
+
+ return impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+ }
+
+ PUGI__FN bool xml_attribute::set_value(int rhs)
+ {
+ if (!_attr) return false;
+
+ return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+ }
+
+ PUGI__FN bool xml_attribute::set_value(unsigned int rhs)
+ {
+ if (!_attr) return false;
+
+ return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+ }
+
+ PUGI__FN bool xml_attribute::set_value(double rhs)
+ {
+ if (!_attr) return false;
+
+ return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+ }
+
+ PUGI__FN bool xml_attribute::set_value(bool rhs)
+ {
+ if (!_attr) return false;
+
+ return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+ }
+
+#ifdef __BORLANDC__
+ PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs)
+ {
+ return (bool)lhs && rhs;
+ }
+
+ PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs)
+ {
+ return (bool)lhs || rhs;
+ }
+#endif
+
+ PUGI__FN xml_node::xml_node(): _root(0)
+ {
+ }
+
+ PUGI__FN xml_node::xml_node(xml_node_struct* p): _root(p)
+ {
+ }
+
+ PUGI__FN static void unspecified_bool_xml_node(xml_node***)
+ {
+ }
+
+ PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const
+ {
+ return _root ? unspecified_bool_xml_node : 0;
+ }
+
+ PUGI__FN bool xml_node::operator!() const
+ {
+ return !_root;
+ }
+
+ PUGI__FN xml_node::iterator xml_node::begin() const
+ {
+ return iterator(_root ? _root->first_child : 0, _root);
+ }
+
+ PUGI__FN xml_node::iterator xml_node::end() const
+ {
+ return iterator(0, _root);
+ }
+
+ PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const
+ {
+ return attribute_iterator(_root ? _root->first_attribute : 0, _root);
+ }
+
+ PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const
+ {
+ return attribute_iterator(0, _root);
+ }
+
+ PUGI__FN xml_object_range<xml_node_iterator> xml_node::children() const
+ {
+ return xml_object_range<xml_node_iterator>(begin(), end());
+ }
+
+ PUGI__FN xml_object_range<xml_named_node_iterator> xml_node::children(const char_t* name_) const
+ {
+ return xml_object_range<xml_named_node_iterator>(xml_named_node_iterator(child(name_), name_), xml_named_node_iterator());
+ }
+
+ PUGI__FN xml_object_range<xml_attribute_iterator> xml_node::attributes() const
+ {
+ return xml_object_range<xml_attribute_iterator>(attributes_begin(), attributes_end());
+ }
+
+ PUGI__FN bool xml_node::operator==(const xml_node& r) const
+ {
+ return (_root == r._root);
+ }
+
+ PUGI__FN bool xml_node::operator!=(const xml_node& r) const
+ {
+ return (_root != r._root);
+ }
+
+ PUGI__FN bool xml_node::operator<(const xml_node& r) const
+ {
+ return (_root < r._root);
+ }
+
+ PUGI__FN bool xml_node::operator>(const xml_node& r) const
+ {
+ return (_root > r._root);
+ }
+
+ PUGI__FN bool xml_node::operator<=(const xml_node& r) const
+ {
+ return (_root <= r._root);
+ }
+
+ PUGI__FN bool xml_node::operator>=(const xml_node& r) const
+ {
+ return (_root >= r._root);
+ }
+
+ PUGI__FN bool xml_node::empty() const
+ {
+ return !_root;
+ }
+
+ PUGI__FN const char_t* xml_node::name() const
+ {
+ return (_root && _root->name) ? _root->name : PUGIXML_TEXT("");
+ }
+
+ PUGI__FN xml_node_type xml_node::type() const
+ {
+ return _root ? static_cast<xml_node_type>((_root->header & impl::xml_memory_page_type_mask) + 1) : node_null;
+ }
+
+ PUGI__FN const char_t* xml_node::value() const
+ {
+ return (_root && _root->value) ? _root->value : PUGIXML_TEXT("");
+ }
+
+ PUGI__FN xml_node xml_node::child(const char_t* name_) const
+ {
+ if (!_root) return xml_node();
+
+ for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+ if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+ return xml_node();
+ }
+
+ PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const
+ {
+ if (!_root) return xml_attribute();
+
+ for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute)
+ if (i->name && impl::strequal(name_, i->name))
+ return xml_attribute(i);
+
+ return xml_attribute();
+ }
+
+ PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const
+ {
+ if (!_root) return xml_node();
+
+ for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling)
+ if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+ return xml_node();
+ }
+
+ PUGI__FN xml_node xml_node::next_sibling() const
+ {
+ if (!_root) return xml_node();
+
+ if (_root->next_sibling) return xml_node(_root->next_sibling);
+ else return xml_node();
+ }
+
+ PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const
+ {
+ if (!_root) return xml_node();
+
+ for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c)
+ if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+ return xml_node();
+ }
+
+ PUGI__FN xml_node xml_node::previous_sibling() const
+ {
+ if (!_root) return xml_node();
+
+ if (_root->prev_sibling_c->next_sibling) return xml_node(_root->prev_sibling_c);
+ else return xml_node();
+ }
+
+ PUGI__FN xml_node xml_node::parent() const
+ {
+ return _root ? xml_node(_root->parent) : xml_node();
+ }
+
+ PUGI__FN xml_node xml_node::root() const
+ {
+ if (!_root) return xml_node();
+
+ impl::xml_memory_page* page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask);
+
+ return xml_node(static_cast<impl::xml_document_struct*>(page->allocator));
+ }
+
+ PUGI__FN xml_text xml_node::text() const
+ {
+ return xml_text(_root);
+ }
+
+ PUGI__FN const char_t* xml_node::child_value() const
+ {
+ if (!_root) return PUGIXML_TEXT("");
+
+ for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+ if (i->value && impl::is_text_node(i))
+ return i->value;
+
+ return PUGIXML_TEXT("");
+ }
+
+ PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const
+ {
+ return child(name_).child_value();
+ }
+
+ PUGI__FN xml_attribute xml_node::first_attribute() const
+ {
+ return _root ? xml_attribute(_root->first_attribute) : xml_attribute();
+ }
+
+ PUGI__FN xml_attribute xml_node::last_attribute() const
+ {
+ return _root && _root->first_attribute ? xml_attribute(_root->first_attribute->prev_attribute_c) : xml_attribute();
+ }
+
+ PUGI__FN xml_node xml_node::first_child() const
+ {
+ return _root ? xml_node(_root->first_child) : xml_node();
+ }
+
+ PUGI__FN xml_node xml_node::last_child() const
+ {
+ return _root && _root->first_child ? xml_node(_root->first_child->prev_sibling_c) : xml_node();
+ }
+
+ PUGI__FN bool xml_node::set_name(const char_t* rhs)
+ {
+ switch (type())
+ {
+ case node_pi:
+ case node_declaration:
+ case node_element:
+ return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs);
+
+ default:
+ return false;
+ }
+ }
+
+ PUGI__FN bool xml_node::set_value(const char_t* rhs)
+ {
+ switch (type())
+ {
+ case node_pi:
+ case node_cdata:
+ case node_pcdata:
+ case node_comment:
+ case node_doctype:
+ return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs);
+
+ default:
+ return false;
+ }
+ }
+
+ PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_)
+ {
+ if (type() != node_element && type() != node_declaration) return xml_attribute();
+
+ xml_attribute a(impl::append_attribute_ll(_root, impl::get_allocator(_root)));
+ a.set_name(name_);
+
+ return a;
+ }
+
+	// Allocates a new attribute and links it at the front of this node's
+	// attribute list. Only element and declaration nodes may carry attributes.
+	// The list uses null-terminated next links but circular prev links
+	// (first->prev_attribute_c points at the last attribute), which is why a
+	// sole attribute points its prev link at itself.
+	PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_)
+	{
+		if (type() != node_element && type() != node_declaration) return xml_attribute();
+
+		xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+		if (!a) return xml_attribute();
+
+		a.set_name(name_);
+
+		xml_attribute_struct* head = _root->first_attribute;
+
+		if (head)
+		{
+			// inherit the old head's back-link to the last attribute
+			a._attr->prev_attribute_c = head->prev_attribute_c;
+			head->prev_attribute_c = a._attr;
+		}
+		else
+			a._attr->prev_attribute_c = a._attr;
+
+		a._attr->next_attribute = head;
+		_root->first_attribute = a._attr;
+
+		return a;
+	}
+
+	// Allocates a new attribute and splices it immediately before `attr`,
+	// which must already belong to this node (verified by walking attr's
+	// prev links back to the list head and comparing with first_attribute).
+	PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr)
+	{
+		if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute();
+
+		// check that attribute belongs to *this
+		xml_attribute_struct* cur = attr._attr;
+
+		// prev link of the first attribute has no next_attribute beyond it, so this stops at the head
+		while (cur->prev_attribute_c->next_attribute) cur = cur->prev_attribute_c;
+
+		if (cur != _root->first_attribute) return xml_attribute();
+
+		xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+		if (!a) return xml_attribute();
+
+		a.set_name(name_);
+
+		// link the predecessor (or the list head, when attr was first) to the new node
+		if (attr._attr->prev_attribute_c->next_attribute)
+			attr._attr->prev_attribute_c->next_attribute = a._attr;
+		else
+			_root->first_attribute = a._attr;
+
+		a._attr->prev_attribute_c = attr._attr->prev_attribute_c;
+		a._attr->next_attribute = attr._attr;
+		attr._attr->prev_attribute_c = a._attr;
+
+		return a;
+	}
+
+ PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr)
+ {
+ if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute();
+
+ // check that attribute belongs to *this
+ xml_attribute_struct* cur = attr._attr;
+
+ while (cur->prev_attribute_c->next_attribute) cur = cur->prev_attribute_c;
+
+ if (cur != _root->first_attribute) return xml_attribute();
+
+ xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+ if (!a) return xml_attribute();
+
+ a.set_name(name_);
+
+ if (attr._attr->next_attribute)
+ attr._attr->next_attribute->prev_attribute_c = a._attr;
+ else
+ _root->first_attribute->prev_attribute_c = a._attr;
+
+ a._attr->next_attribute = attr._attr->next_attribute;
+ a._attr->prev_attribute_c = attr._attr;
+ attr._attr->next_attribute = a._attr;
+
+ return a;
+ }
+
+ PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto)
+ {
+ if (!proto) return xml_attribute();
+
+ xml_attribute result = append_attribute(proto.name());
+ result.set_value(proto.value());
+
+ return result;
+ }
+
+ PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto)
+ {
+ if (!proto) return xml_attribute();
+
+ xml_attribute result = prepend_attribute(proto.name());
+ result.set_value(proto.value());
+
+ return result;
+ }
+
+ PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr)
+ {
+ if (!proto) return xml_attribute();
+
+ xml_attribute result = insert_attribute_after(proto.name(), attr);
+ result.set_value(proto.value());
+
+ return result;
+ }
+
+ PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr)
+ {
+ if (!proto) return xml_attribute();
+
+ xml_attribute result = insert_attribute_before(proto.name(), attr);
+ result.set_value(proto.value());
+
+ return result;
+ }
+
+ PUGI__FN xml_node xml_node::append_child(xml_node_type type_)
+ {
+ if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+
+ xml_node n(impl::append_node(_root, impl::get_allocator(_root), type_));
+
+ if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+ return n;
+ }
+
+ PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_)
+ {
+ if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+
+ xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+ if (!n) return xml_node();
+
+ n._root->parent = _root;
+
+ xml_node_struct* head = _root->first_child;
+
+ if (head)
+ {
+ n._root->prev_sibling_c = head->prev_sibling_c;
+ head->prev_sibling_c = n._root;
+ }
+ else
+ n._root->prev_sibling_c = n._root;
+
+ n._root->next_sibling = head;
+ _root->first_child = n._root;
+
+ if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+ return n;
+ }
+
+ PUGI__FN xml_node xml_node::insert_child_before(xml_node_type type_, const xml_node& node)
+ {
+ if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+ if (!node._root || node._root->parent != _root) return xml_node();
+
+ xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+ if (!n) return xml_node();
+
+ n._root->parent = _root;
+
+ if (node._root->prev_sibling_c->next_sibling)
+ node._root->prev_sibling_c->next_sibling = n._root;
+ else
+ _root->first_child = n._root;
+
+ n._root->prev_sibling_c = node._root->prev_sibling_c;
+ n._root->next_sibling = node._root;
+ node._root->prev_sibling_c = n._root;
+
+ if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+ return n;
+ }
+
+ PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node)
+ {
+ if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+ if (!node._root || node._root->parent != _root) return xml_node();
+
+ xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+ if (!n) return xml_node();
+
+ n._root->parent = _root;
+
+ if (node._root->next_sibling)
+ node._root->next_sibling->prev_sibling_c = n._root;
+ else
+ _root->first_child->prev_sibling_c = n._root;
+
+ n._root->next_sibling = node._root->next_sibling;
+ n._root->prev_sibling_c = node._root;
+ node._root->next_sibling = n._root;
+
+ if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+ return n;
+ }
+
+ PUGI__FN xml_node xml_node::append_child(const char_t* name_)
+ {
+ xml_node result = append_child(node_element);
+
+ result.set_name(name_);
+
+ return result;
+ }
+
+ PUGI__FN xml_node xml_node::prepend_child(const char_t* name_)
+ {
+ xml_node result = prepend_child(node_element);
+
+ result.set_name(name_);
+
+ return result;
+ }
+
+ PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node)
+ {
+ xml_node result = insert_child_after(node_element, node);
+
+ result.set_name(name_);
+
+ return result;
+ }
+
+ PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node)
+ {
+ xml_node result = insert_child_before(node_element, node);
+
+ result.set_name(name_);
+
+ return result;
+ }
+
+ PUGI__FN xml_node xml_node::append_copy(const xml_node& proto)
+ {
+ xml_node result = append_child(proto.type());
+
+ if (result) impl::recursive_copy_skip(result, proto, result);
+
+ return result;
+ }
+
+ PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto)
+ {
+ xml_node result = prepend_child(proto.type());
+
+ if (result) impl::recursive_copy_skip(result, proto, result);
+
+ return result;
+ }
+
+ PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node)
+ {
+ xml_node result = insert_child_after(proto.type(), node);
+
+ if (result) impl::recursive_copy_skip(result, proto, result);
+
+ return result;
+ }
+
+ PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node)
+ {
+ xml_node result = insert_child_before(proto.type(), node);
+
+ if (result) impl::recursive_copy_skip(result, proto, result);
+
+ return result;
+ }
+
+ PUGI__FN bool xml_node::remove_attribute(const char_t* name_)
+ {
+ return remove_attribute(attribute(name_));
+ }
+
+	// Unlinks and destroys attribute `a`, returning false unless it belongs to
+	// this node (verified by walking a's prev links back to first_attribute).
+	PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a)
+	{
+		if (!_root || !a._attr) return false;
+
+		// check that attribute belongs to *this
+		xml_attribute_struct* attr = a._attr;
+
+		while (attr->prev_attribute_c->next_attribute) attr = attr->prev_attribute_c;
+
+		if (attr != _root->first_attribute) return false;
+
+		// repair the successor's back-link (or the head's back-link to the last attribute)
+		if (a._attr->next_attribute) a._attr->next_attribute->prev_attribute_c = a._attr->prev_attribute_c;
+		else if (_root->first_attribute) _root->first_attribute->prev_attribute_c = a._attr->prev_attribute_c;
+
+		// repair the predecessor's forward link (or advance the list head)
+		if (a._attr->prev_attribute_c->next_attribute) a._attr->prev_attribute_c->next_attribute = a._attr->next_attribute;
+		else _root->first_attribute = a._attr->next_attribute;
+
+		impl::destroy_attribute(a._attr, impl::get_allocator(_root));
+
+		return true;
+	}
+
+ PUGI__FN bool xml_node::remove_child(const char_t* name_)
+ {
+ return remove_child(child(name_));
+ }
+
+	// Unlinks and destroys child node `n`; returns false unless n is a direct
+	// child of this node. The sibling list mirrors the attribute list layout:
+	// null-terminated next links, circular prev links through prev_sibling_c.
+	PUGI__FN bool xml_node::remove_child(const xml_node& n)
+	{
+		if (!_root || !n._root || n._root->parent != _root) return false;
+
+		// repair the successor's back-link (or the head's back-link to the last child)
+		if (n._root->next_sibling) n._root->next_sibling->prev_sibling_c = n._root->prev_sibling_c;
+		else if (_root->first_child) _root->first_child->prev_sibling_c = n._root->prev_sibling_c;
+
+		// repair the predecessor's forward link (or advance the child list head)
+		if (n._root->prev_sibling_c->next_sibling) n._root->prev_sibling_c->next_sibling = n._root->next_sibling;
+		else _root->first_child = n._root->next_sibling;
+
+		impl::destroy_node(n._root, impl::get_allocator(_root));
+
+		return true;
+	}
+
+ PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* name_, const char_t* attr_name, const char_t* attr_value) const
+ {
+ if (!_root) return xml_node();
+
+ for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+ if (i->name && impl::strequal(name_, i->name))
+ {
+ for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+ if (impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value))
+ return xml_node(i);
+ }
+
+ return xml_node();
+ }
+
+ PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const
+ {
+ if (!_root) return xml_node();
+
+ for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+ for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+ if (impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value))
+ return xml_node(i);
+
+ return xml_node();
+ }
+
+#ifndef PUGIXML_NO_STL
+ PUGI__FN string_t xml_node::path(char_t delimiter) const
+ {
+ xml_node cursor = *this; // Make a copy.
+
+ string_t result = cursor.name();
+
+ while (cursor.parent())
+ {
+ cursor = cursor.parent();
+
+ string_t temp = cursor.name();
+ temp += delimiter;
+ temp += result;
+ result.swap(temp);
+ }
+
+ return result;
+ }
+#endif
+
+	// Resolves a delimiter-separated path to the first matching descendant.
+	// A leading delimiter makes the path absolute (search restarts at the
+	// document root); '.' keeps the current context and '..' moves to the
+	// parent. One segment is consumed per recursive call, trying each matching
+	// child in order until a full match is found.
+	PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const
+	{
+		xml_node found = *this; // Current search context.
+
+		if (!_root || !path_ || !path_[0]) return found;
+
+		if (path_[0] == delimiter)
+		{
+			// Absolute path; e.g. '/foo/bar'
+			found = found.root();
+			++path_;
+		}
+
+		// isolate the next path segment [path_segment, path_segment_end)
+		const char_t* path_segment = path_;
+
+		while (*path_segment == delimiter) ++path_segment;
+
+		const char_t* path_segment_end = path_segment;
+
+		while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end;
+
+		// empty segment (trailing delimiters) — path fully consumed
+		if (path_segment == path_segment_end) return found;
+
+		const char_t* next_segment = path_segment_end;
+
+		while (*next_segment == delimiter) ++next_segment;
+
+		if (*path_segment == '.' && path_segment + 1 == path_segment_end)
+			return found.first_element_by_path(next_segment, delimiter);
+		else if (*path_segment == '.' && *(path_segment+1) == '.' && path_segment + 2 == path_segment_end)
+			return found.parent().first_element_by_path(next_segment, delimiter);
+		else
+		{
+			// try each child whose name matches this segment; first full match wins
+			for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling)
+			{
+				if (j->name && impl::strequalrange(j->name, path_segment, static_cast<size_t>(path_segment_end - path_segment)))
+				{
+					xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter);
+
+					if (subsearch) return subsearch;
+				}
+			}
+
+			return xml_node();
+		}
+	}
+
+	// Depth-first traversal of the subtree rooted at *this: calls walker.begin()
+	// for the root, walker.for_each() for every descendant (with _depth tracking
+	// nesting relative to the root), and walker.end() on completion. Any callback
+	// returning false aborts the walk immediately.
+	PUGI__FN bool xml_node::traverse(xml_tree_walker& walker)
+	{
+		walker._depth = -1;
+
+		xml_node arg_begin = *this;
+		if (!walker.begin(arg_begin)) return false;
+
+		xml_node cur = first_child();
+
+		if (cur)
+		{
+			++walker._depth;
+
+			do
+			{
+				xml_node arg_for_each = cur;
+				if (!walker.for_each(arg_for_each))
+					return false;
+
+				// descend first, then advance to a sibling, else climb until one exists
+				if (cur.first_child())
+				{
+					++walker._depth;
+					cur = cur.first_child();
+				}
+				else if (cur.next_sibling())
+					cur = cur.next_sibling();
+				else
+				{
+					// Borland C++ workaround
+					while (!cur.next_sibling() && cur != *this && !cur.parent().empty())
+					{
+						--walker._depth;
+						cur = cur.parent();
+					}
+
+					if (cur != *this)
+						cur = cur.next_sibling();
+				}
+			}
+			while (cur && cur != *this);
+		}
+
+		// every descent was matched by an ascent back to the root's level
+		assert(walker._depth == -1);
+
+		xml_node arg_end = *this;
+		return walker.end(arg_end);
+	}
+
+ PUGI__FN size_t xml_node::hash_value() const
+ {
+ return static_cast<size_t>(reinterpret_cast<uintptr_t>(_root) / sizeof(xml_node_struct));
+ }
+
+ PUGI__FN xml_node_struct* xml_node::internal_object() const
+ {
+ return _root;
+ }
+
+ PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+ {
+ if (!_root) return;
+
+ impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+ impl::node_output(buffered_writer, *this, indent, flags, depth);
+ }
+
+#ifndef PUGIXML_NO_STL
+ PUGI__FN void xml_node::print(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+ {
+ xml_writer_stream writer(stream);
+
+ print(writer, indent, flags, encoding, depth);
+ }
+
+ PUGI__FN void xml_node::print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const
+ {
+ xml_writer_stream writer(stream);
+
+ print(writer, indent, flags, encoding_wchar, depth);
+ }
+#endif
+
+	// Best-effort offset of this node within the original parse buffer, for
+	// diagnostics. Returns -1 when the document has no retained buffer or when
+	// the node's name/value was reallocated (allocated-mask bit set) and thus no
+	// longer points into that buffer.
+	PUGI__FN ptrdiff_t xml_node::offset_debug() const
+	{
+		xml_node_struct* r = root()._root;
+
+		if (!r) return -1;
+
+		const char_t* buffer = static_cast<impl::xml_document_struct*>(r)->buffer;
+
+		if (!buffer) return -1;
+
+		switch (type())
+		{
+		case node_document:
+			return 0;
+
+		// named nodes: offset of the in-situ name
+		case node_element:
+		case node_declaration:
+		case node_pi:
+			return (_root->header & impl::xml_memory_page_name_allocated_mask) ? -1 : _root->name - buffer;
+
+		// value-bearing nodes: offset of the in-situ value
+		case node_pcdata:
+		case node_cdata:
+		case node_comment:
+		case node_doctype:
+			return (_root->header & impl::xml_memory_page_value_allocated_mask) ? -1 : _root->value - buffer;
+
+		default:
+			return -1;
+		}
+	}
+
+#ifdef __BORLANDC__
+ PUGI__FN bool operator&&(const xml_node& lhs, bool rhs)
+ {
+ return (bool)lhs && rhs;
+ }
+
+ PUGI__FN bool operator||(const xml_node& lhs, bool rhs)
+ {
+ return (bool)lhs || rhs;
+ }
+#endif
+
+ PUGI__FN xml_text::xml_text(xml_node_struct* root): _root(root)
+ {
+ }
+
+ PUGI__FN xml_node_struct* xml_text::_data() const
+ {
+ if (!_root || impl::is_text_node(_root)) return _root;
+
+ for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling)
+ if (impl::is_text_node(node))
+ return node;
+
+ return 0;
+ }
+
+ PUGI__FN xml_node_struct* xml_text::_data_new()
+ {
+ xml_node_struct* d = _data();
+ if (d) return d;
+
+ return xml_node(_root).append_child(node_pcdata).internal_object();
+ }
+
+ PUGI__FN xml_text::xml_text(): _root(0)
+ {
+ }
+
+ PUGI__FN static void unspecified_bool_xml_text(xml_text***)
+ {
+ }
+
+ PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const
+ {
+ return _data() ? unspecified_bool_xml_text : 0;
+ }
+
+ PUGI__FN bool xml_text::operator!() const
+ {
+ return !_data();
+ }
+
+ PUGI__FN bool xml_text::empty() const
+ {
+ return _data() == 0;
+ }
+
+ PUGI__FN const char_t* xml_text::get() const
+ {
+ xml_node_struct* d = _data();
+
+ return (d && d->value) ? d->value : PUGIXML_TEXT("");
+ }
+
+ PUGI__FN const char_t* xml_text::as_string(const char_t* def) const
+ {
+ xml_node_struct* d = _data();
+
+ return (d && d->value) ? d->value : def;
+ }
+
+ PUGI__FN int xml_text::as_int(int def) const
+ {
+ xml_node_struct* d = _data();
+
+ return impl::get_value_int(d ? d->value : 0, def);
+ }
+
+ PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const
+ {
+ xml_node_struct* d = _data();
+
+ return impl::get_value_uint(d ? d->value : 0, def);
+ }
+
+ PUGI__FN double xml_text::as_double(double def) const
+ {
+ xml_node_struct* d = _data();
+
+ return impl::get_value_double(d ? d->value : 0, def);
+ }
+
+ PUGI__FN float xml_text::as_float(float def) const
+ {
+ xml_node_struct* d = _data();
+
+ return impl::get_value_float(d ? d->value : 0, def);
+ }
+
+ PUGI__FN bool xml_text::as_bool(bool def) const
+ {
+ xml_node_struct* d = _data();
+
+ return impl::get_value_bool(d ? d->value : 0, def);
+ }
+
+ PUGI__FN bool xml_text::set(const char_t* rhs)
+ {
+ xml_node_struct* dn = _data_new();
+
+ return dn ? impl::strcpy_insitu(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+ }
+
+ PUGI__FN bool xml_text::set(int rhs)
+ {
+ xml_node_struct* dn = _data_new();
+
+ return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+ }
+
+ PUGI__FN bool xml_text::set(unsigned int rhs)
+ {
+ xml_node_struct* dn = _data_new();
+
+ return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+ }
+
+ PUGI__FN bool xml_text::set(double rhs)
+ {
+ xml_node_struct* dn = _data_new();
+
+ return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+ }
+
+ PUGI__FN bool xml_text::set(bool rhs)
+ {
+ xml_node_struct* dn = _data_new();
+
+ return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+ }
+
+ PUGI__FN xml_text& xml_text::operator=(const char_t* rhs)
+ {
+ set(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_text& xml_text::operator=(int rhs)
+ {
+ set(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_text& xml_text::operator=(unsigned int rhs)
+ {
+ set(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_text& xml_text::operator=(double rhs)
+ {
+ set(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_text& xml_text::operator=(bool rhs)
+ {
+ set(rhs);
+ return *this;
+ }
+
+ PUGI__FN xml_node xml_text::data() const
+ {
+ return xml_node(_data());
+ }
+
+#ifdef __BORLANDC__
+ PUGI__FN bool operator&&(const xml_text& lhs, bool rhs)
+ {
+ return (bool)lhs && rhs;
+ }
+
+ PUGI__FN bool operator||(const xml_text& lhs, bool rhs)
+ {
+ return (bool)lhs || rhs;
+ }
+#endif
+
+ PUGI__FN xml_node_iterator::xml_node_iterator()
+ {
+ }
+
+ PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent())
+ {
+ }
+
+ PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
+ {
+ }
+
+ PUGI__FN bool xml_node_iterator::operator==(const xml_node_iterator& rhs) const
+ {
+ return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root;
+ }
+
+ PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const
+ {
+ return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root;
+ }
+
+ PUGI__FN xml_node& xml_node_iterator::operator*() const
+ {
+ assert(_wrap._root);
+ return _wrap;
+ }
+
+ PUGI__FN xml_node* xml_node_iterator::operator->() const
+ {
+ assert(_wrap._root);
+ return const_cast<xml_node*>(&_wrap); // BCC32 workaround
+ }
+
+ PUGI__FN const xml_node_iterator& xml_node_iterator::operator++()
+ {
+ assert(_wrap._root);
+ _wrap._root = _wrap._root->next_sibling;
+ return *this;
+ }
+
+ PUGI__FN xml_node_iterator xml_node_iterator::operator++(int)
+ {
+ xml_node_iterator temp = *this;
+ ++*this;
+ return temp;
+ }
+
+ PUGI__FN const xml_node_iterator& xml_node_iterator::operator--()
+ {
+ _wrap = _wrap._root ? _wrap.previous_sibling() : _parent.last_child();
+ return *this;
+ }
+
+ PUGI__FN xml_node_iterator xml_node_iterator::operator--(int)
+ {
+ xml_node_iterator temp = *this;
+ --*this;
+ return temp;
+ }
+
+ PUGI__FN xml_attribute_iterator::xml_attribute_iterator()
+ {
+ }
+
+ PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent)
+ {
+ }
+
+ PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
+ {
+ }
+
+ PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const
+ {
+ return _wrap._attr == rhs._wrap._attr && _parent._root == rhs._parent._root;
+ }
+
+ PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const
+ {
+ return _wrap._attr != rhs._wrap._attr || _parent._root != rhs._parent._root;
+ }
+
+ PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const
+ {
+ assert(_wrap._attr);
+ return _wrap;
+ }
+
+ PUGI__FN xml_attribute* xml_attribute_iterator::operator->() const
+ {
+ assert(_wrap._attr);
+ return const_cast<xml_attribute*>(&_wrap); // BCC32 workaround
+ }
+
+ PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++()
+ {
+ assert(_wrap._attr);
+ _wrap._attr = _wrap._attr->next_attribute;
+ return *this;
+ }
+
+ PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int)
+ {
+ xml_attribute_iterator temp = *this;
+ ++*this;
+ return temp;
+ }
+
+ PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--()
+ {
+ _wrap = _wrap._attr ? _wrap.previous_attribute() : _parent.last_attribute();
+ return *this;
+ }
+
+ PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int)
+ {
+ xml_attribute_iterator temp = *this;
+ --*this;
+ return temp;
+ }
+
+ PUGI__FN xml_named_node_iterator::xml_named_node_iterator(): _name(0)
+ {
+ }
+
+ PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name): _node(node), _name(name)
+ {
+ }
+
+ PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const
+ {
+ return _node == rhs._node;
+ }
+
+ PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const
+ {
+ return _node != rhs._node;
+ }
+
+ PUGI__FN xml_node& xml_named_node_iterator::operator*() const
+ {
+ assert(_node._root);
+ return _node;
+ }
+
+ PUGI__FN xml_node* xml_named_node_iterator::operator->() const
+ {
+ assert(_node._root);
+ return const_cast<xml_node*>(&_node); // BCC32 workaround
+ }
+
+ PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++()
+ {
+ assert(_node._root);
+ _node = _node.next_sibling(_name);
+ return *this;
+ }
+
+ PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator++(int)
+ {
+ xml_named_node_iterator temp = *this;
+ ++*this;
+ return temp;
+ }
+
+ PUGI__FN xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto)
+ {
+ }
+
+ PUGI__FN xml_parse_result::operator bool() const
+ {
+ return status == status_ok;
+ }
+
+ PUGI__FN const char* xml_parse_result::description() const
+ {
+ switch (status)
+ {
+ case status_ok: return "No error";
+
+ case status_file_not_found: return "File was not found";
+ case status_io_error: return "Error reading from file/stream";
+ case status_out_of_memory: return "Could not allocate memory";
+ case status_internal_error: return "Internal error occurred";
+
+ case status_unrecognized_tag: return "Could not determine tag type";
+
+ case status_bad_pi: return "Error parsing document declaration/processing instruction";
+ case status_bad_comment: return "Error parsing comment";
+ case status_bad_cdata: return "Error parsing CDATA section";
+ case status_bad_doctype: return "Error parsing document type declaration";
+ case status_bad_pcdata: return "Error parsing PCDATA section";
+ case status_bad_start_element: return "Error parsing start element tag";
+ case status_bad_attribute: return "Error parsing element attribute";
+ case status_bad_end_element: return "Error parsing end element tag";
+ case status_end_element_mismatch: return "Start-end tags mismatch";
+
+ default: return "Unknown error";
+ }
+ }
+
+ PUGI__FN xml_document::xml_document(): _buffer(0)
+ {
+ create();
+ }
+
+ PUGI__FN xml_document::~xml_document()
+ {
+ destroy();
+ }
+
+ PUGI__FN void xml_document::reset()
+ {
+ destroy();
+ create();
+ }
+
+ PUGI__FN void xml_document::reset(const xml_document& proto)
+ {
+ reset();
+
+ for (xml_node cur = proto.first_child(); cur; cur = cur.next_sibling())
+ append_copy(cur);
+ }
+
+	// Initializes the document's sentinel memory page and root node.
+	// The sentinel page lives inside the fixed-size _memory buffer (no heap
+	// allocation); all subsequent node/attribute allocations spill into
+	// dynamically allocated pages chained after it.
+	PUGI__FN void xml_document::create()
+	{
+		// initialize sentinel page; verify at compile time that _memory is large
+		// enough to hold a page header + document struct after alignment
+		PUGI__STATIC_ASSERT(offsetof(impl::xml_memory_page, data) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment <= sizeof(_memory));
+
+		// align upwards to page boundary
+		void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1));
+
+		// prepare page structure
+		impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory);
+
+		// mark the sentinel page as full so the allocator never places new
+		// objects on it; it only ever holds the document struct below
+		page->busy_size = impl::xml_memory_page_size;
+
+		// allocate new root (placement-new into the sentinel page's data area)
+		_root = new (page->data) impl::xml_document_struct(page);
+		// prev_sibling_c is a cyclic link; a lone node points at itself
+		_root->prev_sibling_c = _root;
+
+		// setup sentinel page: the document struct doubles as the allocator
+		page->allocator = static_cast<impl::xml_document_struct*>(_root);
+	}
+
+	// Releases everything the document owns: the private parse buffer (if any)
+	// and every dynamically allocated memory page. The sentinel page itself is
+	// embedded in _memory (static storage) and is only reset, never freed.
+	PUGI__FN void xml_document::destroy()
+	{
+		// destroy static storage
+		if (_buffer)
+		{
+			impl::xml_memory::deallocate(_buffer);
+			_buffer = 0;
+		}
+
+		// destroy dynamic storage, leave sentinel page (it's in static memory)
+		if (_root)
+		{
+			// recover the sentinel page from the root node's header; the page
+			// pointer is packed into the header's high bits
+			impl::xml_memory_page* root_page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask);
+			// sentinel page must be the first page and must not own heap memory
+			assert(root_page && !root_page->prev && !root_page->memory);
+
+			// destroy all pages (save the next pointer before freeing each page)
+			for (impl::xml_memory_page* page = root_page->next; page; )
+			{
+				impl::xml_memory_page* next = page->next;
+
+				impl::xml_allocator::deallocate_page(page);
+
+				page = next;
+			}
+
+			// cleanup root page so a subsequent create() starts from a clean state
+			root_page->allocator = 0;
+			root_page->next = 0;
+			root_page->busy_size = root_page->freed_size = 0;
+
+			_root = 0;
+		}
+	}
+
+#ifndef PUGIXML_NO_STL
+ PUGI__FN xml_parse_result xml_document::load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options, xml_encoding encoding)
+ {
+ reset();
+
+ return impl::load_stream_impl(*this, stream, options, encoding);
+ }
+
+ PUGI__FN xml_parse_result xml_document::load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options)
+ {
+ reset();
+
+ return impl::load_stream_impl(*this, stream, options, encoding_wchar);
+ }
+#endif
+
+ PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options)
+ {
+ // Force native encoding (skip autodetection)
+ #ifdef PUGIXML_WCHAR_MODE
+ xml_encoding encoding = encoding_wchar;
+ #else
+ xml_encoding encoding = encoding_utf8;
+ #endif
+
+ return load_buffer(contents, impl::strlength(contents) * sizeof(char_t), options, encoding);
+ }
+
+ PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding)
+ {
+ reset();
+
+ FILE* file = fopen(path_, "rb");
+
+ return impl::load_file_impl(*this, file, options, encoding);
+ }
+
+ PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding)
+ {
+ reset();
+
+ FILE* file = impl::open_file_wide(path_, L"rb");
+
+ return impl::load_file_impl(*this, file, options, encoding);
+ }
+
+	// Common implementation behind all load_buffer* entry points.
+	// contents/size: input buffer; is_mutable: parser may modify it in place;
+	// own: the document takes ownership of `contents` and must free it.
+	// Returns the parse result with the detected encoding filled in.
+	PUGI__FN xml_parse_result xml_document::load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own)
+	{
+		reset();
+
+		// check input buffer (a null buffer is only valid when empty)
+		assert(contents || size == 0);
+
+		// get actual encoding (resolves encoding_auto via BOM/heuristics)
+		xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size);
+
+		// get private buffer; convert_buffer may reuse `contents` directly
+		// (when mutable and already in the native encoding) or allocate a copy
+		char_t* buffer = 0;
+		size_t length = 0;
+
+		if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory);
+
+		// delete original buffer if we performed a conversion
+		if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents);
+
+		// parse
+		xml_parse_result res = impl::xml_parser::parse(buffer, length, _root, options);
+
+		// remember encoding
+		res.encoding = buffer_encoding;
+
+		// grab onto buffer if it's our buffer; otherwise the caller is responsible for deallocating contents
+		if (own || buffer != contents) _buffer = buffer;
+
+		return res;
+	}
+
+ PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+ {
+ return load_buffer_impl(const_cast<void*>(contents), size, options, encoding, false, false);
+ }
+
+ PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+ {
+ return load_buffer_impl(contents, size, options, encoding, true, false);
+ }
+
+ PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+ {
+ return load_buffer_impl(contents, size, options, encoding, true, true);
+ }
+
+	// Serializes the whole document through `writer`, optionally emitting a
+	// BOM and an XML declaration depending on `flags`, then writing the node
+	// tree with the given indentation.
+	PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+		// Latin-1 has no BOM, so suppress it for that encoding
+		if ((flags & format_write_bom) && encoding != encoding_latin1)
+		{
+			// BOM always represents the codepoint U+FEFF, so just write it in native encoding
+		#ifdef PUGIXML_WCHAR_MODE
+			unsigned int bom = 0xfeff;
+			buffered_writer.write(static_cast<wchar_t>(bom));
+		#else
+			buffered_writer.write('\xef', '\xbb', '\xbf');
+		#endif
+		}
+
+		// emit a declaration unless suppressed or the document already has one
+		if (!(flags & format_no_declaration) && !impl::has_declaration(*this))
+		{
+			buffered_writer.write(PUGIXML_TEXT("<?xml version=\"1.0\""));
+			if (encoding == encoding_latin1) buffered_writer.write(PUGIXML_TEXT(" encoding=\"ISO-8859-1\""));
+			buffered_writer.write('?', '>');
+			if (!(flags & format_raw)) buffered_writer.write('\n');
+		}
+
+		impl::node_output(buffered_writer, *this, indent, flags, 0);
+	}
+
+#ifndef PUGIXML_NO_STL
+ PUGI__FN void xml_document::save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+ {
+ xml_writer_stream writer(stream);
+
+ save(writer, indent, flags, encoding);
+ }
+
+ PUGI__FN void xml_document::save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags) const
+ {
+ xml_writer_stream writer(stream);
+
+ save(writer, indent, flags, encoding_wchar);
+ }
+#endif
+
+ PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+ {
+ FILE* file = fopen(path_, (flags & format_save_file_text) ? "w" : "wb");
+ return impl::save_file_impl(*this, file, indent, flags, encoding);
+ }
+
+ PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+ {
+ FILE* file = impl::open_file_wide(path_, (flags & format_save_file_text) ? L"w" : L"wb");
+ return impl::save_file_impl(*this, file, indent, flags, encoding);
+ }
+
+ PUGI__FN xml_node xml_document::document_element() const
+ {
+ for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+ if ((i->header & impl::xml_memory_page_type_mask) + 1 == node_element)
+ return xml_node(i);
+
+ return xml_node();
+ }
+
+#ifndef PUGIXML_NO_STL
+ PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str)
+ {
+ assert(str);
+
+ return impl::as_utf8_impl(str, wcslen(str));
+ }
+
+ PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t>& str)
+ {
+ return impl::as_utf8_impl(str.c_str(), str.size());
+ }
+
+ PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const char* str)
+ {
+ assert(str);
+
+ return impl::as_wide_impl(str, strlen(str));
+ }
+
+ PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const std::string& str)
+ {
+ return impl::as_wide_impl(str.c_str(), str.size());
+ }
+#endif
+
+ PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate)
+ {
+ impl::xml_memory::allocate = allocate;
+ impl::xml_memory::deallocate = deallocate;
+ }
+
+ PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function()
+ {
+ return impl::xml_memory::allocate;
+ }
+
+ PUGI__FN deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function()
+ {
+ return impl::xml_memory::deallocate;
+ }
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+ // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+ PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&)
+ {
+ return std::bidirectional_iterator_tag();
+ }
+
+ PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&)
+ {
+ return std::bidirectional_iterator_tag();
+ }
+
+ PUGI__FN std::forward_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&)
+ {
+ return std::forward_iterator_tag();
+ }
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+ // Workarounds for (non-standard) iterator category detection
+ PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&)
+ {
+ return std::bidirectional_iterator_tag();
+ }
+
+ PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&)
+ {
+ return std::bidirectional_iterator_tag();
+ }
+
+ PUGI__FN std::forward_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&)
+ {
+ return std::forward_iterator_tag();
+ }
+}
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+
+// STL replacements
+PUGI__NS_BEGIN
+ struct equal_to
+ {
+ template <typename T> bool operator()(const T& lhs, const T& rhs) const
+ {
+ return lhs == rhs;
+ }
+ };
+
+ struct not_equal_to
+ {
+ template <typename T> bool operator()(const T& lhs, const T& rhs) const
+ {
+ return lhs != rhs;
+ }
+ };
+
+ struct less
+ {
+ template <typename T> bool operator()(const T& lhs, const T& rhs) const
+ {
+ return lhs < rhs;
+ }
+ };
+
+ struct less_equal
+ {
+ template <typename T> bool operator()(const T& lhs, const T& rhs) const
+ {
+ return lhs <= rhs;
+ }
+ };
+
+ template <typename T> void swap(T& lhs, T& rhs)
+ {
+ T temp = lhs;
+ lhs = rhs;
+ rhs = temp;
+ }
+
+ template <typename I, typename Pred> I min_element(I begin, I end, const Pred& pred)
+ {
+ I result = begin;
+
+ for (I it = begin + 1; it != end; ++it)
+ if (pred(*it, *result))
+ result = it;
+
+ return result;
+ }
+
+ template <typename I> void reverse(I begin, I end)
+ {
+ while (begin + 1 < end) swap(*begin++, *--end);
+ }
+
+	// std::unique replacement: removes consecutive duplicate elements in place
+	// and returns the new past-the-end iterator. Requires operator!= on *I.
+	template <typename I> I unique(I begin, I end)
+	{
+		// fast skip head: advance while adjacent elements already differ,
+		// avoiding writes for the (common) prefix with no duplicates
+		while (begin + 1 < end && *begin != *(begin + 1)) begin++;
+
+		if (begin == end) return begin;
+
+		// last written element
+		I write = begin++;
+
+		// merge unique elements: copy forward only those that differ from the
+		// last element kept
+		while (begin != end)
+		{
+			if (*begin != *write)
+				*++write = *begin++;
+			else
+				begin++;
+		}
+
+		// past-the-end (write points to live element)
+		return write + 1;
+	}
+
+ template <typename I> void copy_backwards(I begin, I end, I target)
+ {
+ while (begin != end) *--target = *--end;
+ }
+
+	// Insertion sort over [begin, end) using `pred` as the ordering.
+	// The trailing T* parameter only carries the value type for pre-C++11
+	// compilers (no decltype); it is otherwise unused.
+	// Precondition: the range is non-empty.
+	template <typename I, typename Pred, typename T> void insertion_sort(I begin, I end, const Pred& pred, T*)
+	{
+		assert(begin != end);
+
+		for (I it = begin + 1; it != end; ++it)
+		{
+			T val = *it;
+
+			// element smaller than the current front: shift the whole sorted
+			// prefix right by one and place it at the front, so the hole-moving
+			// loop below never needs a begin-of-range check
+			if (pred(val, *begin))
+			{
+				// move to front
+				copy_backwards(begin, it, it + 1);
+				*begin = val;
+			}
+			else
+			{
+				I hole = it;
+
+				// move hole backwards until val is not smaller than the element
+				// before the hole (guaranteed to stop: *begin <= val here)
+				while (pred(val, *(hole - 1)))
+				{
+					*hole = *(hole - 1);
+					hole--;
+				}
+
+				// fill hole with element
+				*hole = val;
+			}
+		}
+	}
+
+ // std variant for elements with ==
+ template <typename I, typename Pred> void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend)
+ {
+ I eqbeg = middle, eqend = middle + 1;
+
+ // expand equal range
+ while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg;
+ while (eqend != end && *eqend == *eqbeg) ++eqend;
+
+ // process outer elements
+ I ltend = eqbeg, gtbeg = eqend;
+
+ for (;;)
+ {
+ // find the element from the right side that belongs to the left one
+ for (; gtbeg != end; ++gtbeg)
+ if (!pred(*eqbeg, *gtbeg))
+ {
+ if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++);
+ else break;
+ }
+
+ // find the element from the left side that belongs to the right one
+ for (; ltend != begin; --ltend)
+ if (!pred(*(ltend - 1), *eqbeg))
+ {
+ if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg);
+ else break;
+ }
+
+ // scanned all elements
+ if (gtbeg == end && ltend == begin)
+ {
+ *out_eqbeg = eqbeg;
+ *out_eqend = eqend;
+ return;
+ }
+
+ // make room for elements by moving equal area
+ if (gtbeg == end)
+ {
+ if (--ltend != --eqbeg) swap(*ltend, *eqbeg);
+ swap(*eqbeg, *--eqend);
+ }
+ else if (ltend == begin)
+ {
+ if (eqend != gtbeg) swap(*eqbeg, *eqend);
+ ++eqend;
+ swap(*gtbeg++, *eqbeg++);
+ }
+ else swap(*gtbeg++, *--ltend);
+ }
+ }
+
+ template <typename I, typename Pred> void median3(I first, I middle, I last, const Pred& pred)
+ {
+ if (pred(*middle, *first)) swap(*middle, *first);
+ if (pred(*last, *middle)) swap(*last, *middle);
+ if (pred(*middle, *first)) swap(*middle, *first);
+ }
+
+	// Moves a pivot estimate into *middle: median-of-three for small ranges,
+	// pseudo-median-of-nine (Tukey's ninther) for larger ones, which guards
+	// quicksort against adversarial / partially sorted inputs.
+	template <typename I, typename Pred> void median(I first, I middle, I last, const Pred& pred)
+	{
+		if (last - first <= 40)
+		{
+			// median of three for small chunks
+			median3(first, middle, last, pred);
+		}
+		else
+		{
+			// median of nine: median the three medians of three sample triples
+			size_t step = (last - first + 1) / 8;
+
+			median3(first, first + step, first + 2 * step, pred);
+			median3(middle - step, middle, middle + step, pred);
+			median3(last - 2 * step, last - step, last, pred);
+			median3(first + step, middle, last - step, pred);
+		}
+	}
+
+	// std::sort replacement: quicksort with a fat (three-way) partition and a
+	// median-of-3/9 pivot, falling back to insertion sort for chunks of <= 32
+	// elements. Recurses only into the smaller half and loops on the larger
+	// one, bounding recursion depth to O(log n).
+	template <typename I, typename Pred> void sort(I begin, I end, const Pred& pred)
+	{
+		// sort large chunks
+		while (end - begin > 32)
+		{
+			// find median element
+			I middle = begin + (end - begin) / 2;
+			median(begin, middle, end - 1, pred);
+
+			// partition in three chunks (< = >)
+			I eqbeg, eqend;
+			partition(begin, middle, end, pred, &eqbeg, &eqend);
+
+			// loop on larger half; elements equal to the pivot are already placed
+			if (eqbeg - begin > end - eqend)
+			{
+				sort(eqend, end, pred);
+				end = eqbeg;
+			}
+			else
+			{
+				sort(begin, eqbeg, pred);
+				begin = eqend;
+			}
+		}
+
+		// insertion sort small chunk (&*begin passes the value type, see insertion_sort)
+		if (begin != end) insertion_sort(begin, end, pred, &*begin);
+	}
+PUGI__NS_END
+
+// Allocator used for AST and evaluation stacks
+PUGI__NS_BEGIN
+ struct xpath_memory_block
+ {
+ xpath_memory_block* next;
+
+ char data[
+ #ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE
+ PUGIXML_MEMORY_XPATH_PAGE_SIZE
+ #else
+ 4096
+ #endif
+ ];
+ };
+
+	// Bump allocator over a singly-linked chain of xpath_memory_block pages.
+	// _root is the current (most recent) block, _root_size the bytes used in
+	// it. Supports cheap snapshot/rollback (revert) and tail reallocation of
+	// the most recently allocated object. Allocation failure either throws
+	// std::bad_alloc or longjmps to error_handler when exceptions are disabled.
+	class xpath_allocator
+	{
+		xpath_memory_block* _root;
+		size_t _root_size;
+
+	public:
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		jmp_buf* error_handler;
+	#endif
+
+		// root: initial (caller-owned) block; root_size: bytes already in use in it
+		xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			error_handler = 0;
+		#endif
+		}
+
+		// Allocates `size` bytes or returns 0 on failure (never throws).
+		void* allocate_nothrow(size_t size)
+		{
+			const size_t block_capacity = sizeof(_root->data);
+
+			// align size so that we're able to store pointers in subsequent blocks
+			size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+			if (_root_size + size <= block_capacity)
+			{
+				// fast path: bump within the current block
+				void* buf = _root->data + _root_size;
+				_root_size += size;
+				return buf;
+			}
+			else
+			{
+				// oversized requests get a dedicated block of exactly `size`
+				size_t block_data_size = (size > block_capacity) ? size : block_capacity;
+				size_t block_size = block_data_size + offsetof(xpath_memory_block, data);
+
+				xpath_memory_block* block = static_cast<xpath_memory_block*>(xml_memory::allocate(block_size));
+				if (!block) return 0;
+
+				block->next = _root;
+
+				_root = block;
+				_root_size = size;
+
+				return block->data;
+			}
+		}
+
+		// Allocates `size` bytes; on failure throws bad_alloc or longjmps.
+		void* allocate(size_t size)
+		{
+			void* result = allocate_nothrow(size);
+
+			if (!result)
+			{
+			#ifdef PUGIXML_NO_EXCEPTIONS
+				assert(error_handler);
+				longjmp(*error_handler, 1);
+			#else
+				throw std::bad_alloc();
+			#endif
+			}
+
+			return result;
+		}
+
+		// Grows the LAST allocated object from old_size to new_size bytes.
+		// Only the most recent allocation can be reallocated (asserted below).
+		void* reallocate(void* ptr, size_t old_size, size_t new_size)
+		{
+			// align size so that we're able to store pointers in subsequent blocks
+			old_size = (old_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+			new_size = (new_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+			// we can only reallocate the last object
+			assert(ptr == 0 || static_cast<char*>(ptr) + old_size == _root->data + _root_size);
+
+			// adjust root size so that we have not allocated the object at all
+			bool only_object = (_root_size == old_size);
+
+			if (ptr) _root_size -= old_size;
+
+			// allocate a new version (this will obviously reuse the memory if possible)
+			void* result = allocate(new_size);
+			assert(result);
+
+			// we have a new block
+			if (result != ptr && ptr)
+			{
+				// copy old data
+				assert(new_size > old_size);
+				memcpy(result, ptr, old_size);
+
+				// free the previous page if it had no other objects
+				if (only_object)
+				{
+					assert(_root->data == result);
+					assert(_root->next);
+
+					xpath_memory_block* next = _root->next->next;
+
+					if (next)
+					{
+						// deallocate the whole page, unless it was the first one
+						xml_memory::deallocate(_root->next);
+						_root->next = next;
+					}
+				}
+			}
+
+			return result;
+		}
+
+		// Rolls the allocator back to a previously captured state, freeing
+		// every block allocated since. `state` must be an earlier copy of *this.
+		void revert(const xpath_allocator& state)
+		{
+			// free all new pages
+			xpath_memory_block* cur = _root;
+
+			while (cur != state._root)
+			{
+				xpath_memory_block* next = cur->next;
+
+				xml_memory::deallocate(cur);
+
+				cur = next;
+			}
+
+			// restore state
+			_root = state._root;
+			_root_size = state._root_size;
+		}
+
+		// Frees every block except the last one in the chain (the initial
+		// caller-owned block, which was not heap-allocated by this class).
+		void release()
+		{
+			xpath_memory_block* cur = _root;
+			assert(cur);
+
+			while (cur->next)
+			{
+				xpath_memory_block* next = cur->next;
+
+				xml_memory::deallocate(cur);
+
+				cur = next;
+			}
+		}
+	};
+
+ struct xpath_allocator_capture
+ {
+ xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc)
+ {
+ }
+
+ ~xpath_allocator_capture()
+ {
+ _target->revert(_state);
+ }
+
+ xpath_allocator* _target;
+ xpath_allocator _state;
+ };
+
+ struct xpath_stack
+ {
+ xpath_allocator* result;
+ xpath_allocator* temp;
+ };
+
+ struct xpath_stack_data
+ {
+ xpath_memory_block blocks[2];
+ xpath_allocator result;
+ xpath_allocator temp;
+ xpath_stack stack;
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ jmp_buf error_handler;
+ #endif
+
+ xpath_stack_data(): result(blocks + 0), temp(blocks + 1)
+ {
+ blocks[0].next = blocks[1].next = 0;
+
+ stack.result = &result;
+ stack.temp = &temp;
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ result.error_handler = temp.error_handler = &error_handler;
+ #endif
+ }
+
+ ~xpath_stack_data()
+ {
+ result.release();
+ temp.release();
+ }
+ };
+PUGI__NS_END
+
+// String class
+PUGI__NS_BEGIN
+ class xpath_string
+ {
+ const char_t* _buffer;
+ bool _uses_heap;
+
+ static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc)
+ {
+ char_t* result = static_cast<char_t*>(alloc->allocate((length + 1) * sizeof(char_t)));
+ assert(result);
+
+ memcpy(result, string, length * sizeof(char_t));
+ result[length] = 0;
+
+ return result;
+ }
+
+ static char_t* duplicate_string(const char_t* string, xpath_allocator* alloc)
+ {
+ return duplicate_string(string, strlength(string), alloc);
+ }
+
+ public:
+ xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false)
+ {
+ }
+
+ explicit xpath_string(const char_t* str, xpath_allocator* alloc)
+ {
+ bool empty_ = (*str == 0);
+
+ _buffer = empty_ ? PUGIXML_TEXT("") : duplicate_string(str, alloc);
+ _uses_heap = !empty_;
+ }
+
+ explicit xpath_string(const char_t* str, bool use_heap): _buffer(str), _uses_heap(use_heap)
+ {
+ }
+
+ xpath_string(const char_t* begin, const char_t* end, xpath_allocator* alloc)
+ {
+ assert(begin <= end);
+
+ bool empty_ = (begin == end);
+
+ _buffer = empty_ ? PUGIXML_TEXT("") : duplicate_string(begin, static_cast<size_t>(end - begin), alloc);
+ _uses_heap = !empty_;
+ }
+
+ void append(const xpath_string& o, xpath_allocator* alloc)
+ {
+ // skip empty sources
+ if (!*o._buffer) return;
+
+ // fast append for constant empty target and constant source
+ if (!*_buffer && !_uses_heap && !o._uses_heap)
+ {
+ _buffer = o._buffer;
+ }
+ else
+ {
+ // need to make heap copy
+ size_t target_length = strlength(_buffer);
+ size_t source_length = strlength(o._buffer);
+ size_t result_length = target_length + source_length;
+
+ // allocate new buffer
+ char_t* result = static_cast<char_t*>(alloc->reallocate(_uses_heap ? const_cast<char_t*>(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t)));
+ assert(result);
+
+ // append first string to the new buffer in case there was no reallocation
+ if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t));
+
+ // append second string to the new buffer
+ memcpy(result + target_length, o._buffer, source_length * sizeof(char_t));
+ result[result_length] = 0;
+
+ // finalize
+ _buffer = result;
+ _uses_heap = true;
+ }
+ }
+
+ const char_t* c_str() const
+ {
+ return _buffer;
+ }
+
+ size_t length() const
+ {
+ return strlength(_buffer);
+ }
+
+ char_t* data(xpath_allocator* alloc)
+ {
+ // make private heap copy
+ if (!_uses_heap)
+ {
+ _buffer = duplicate_string(_buffer, alloc);
+ _uses_heap = true;
+ }
+
+ return const_cast<char_t*>(_buffer);
+ }
+
+ bool empty() const
+ {
+ return *_buffer == 0;
+ }
+
+ bool operator==(const xpath_string& o) const
+ {
+ return strequal(_buffer, o._buffer);
+ }
+
+ bool operator!=(const xpath_string& o) const
+ {
+ return !strequal(_buffer, o._buffer);
+ }
+
+ bool uses_heap() const
+ {
+ return _uses_heap;
+ }
+ };
+
+ PUGI__FN xpath_string xpath_string_const(const char_t* str)
+ {
+ return xpath_string(str, false);
+ }
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+ PUGI__FN bool starts_with(const char_t* string, const char_t* pattern)
+ {
+ while (*pattern && *string == *pattern)
+ {
+ string++;
+ pattern++;
+ }
+
+ return *pattern == 0;
+ }
+
+ PUGI__FN const char_t* find_char(const char_t* s, char_t c)
+ {
+ #ifdef PUGIXML_WCHAR_MODE
+ return wcschr(s, c);
+ #else
+ return strchr(s, c);
+ #endif
+ }
+
+ PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p)
+ {
+ #ifdef PUGIXML_WCHAR_MODE
+ // MSVC6 wcsstr bug workaround (if s is empty it always returns 0)
+ return (*p == 0) ? s : wcsstr(s, p);
+ #else
+ return strstr(s, p);
+ #endif
+ }
+
+ // Converts symbol to lower case, if it is an ASCII one
+ PUGI__FN char_t tolower_ascii(char_t ch)
+ {
+ return static_cast<unsigned int>(ch - 'A') < 26 ? static_cast<char_t>(ch | ' ') : ch;
+ }
+
+	// Computes the XPath string-value of a node-set member: an attribute's
+	// value, a text-like node's value, or — for elements/documents — the
+	// concatenation of all descendant pcdata/cdata, gathered by an iterative
+	// depth-first walk (no recursion). Strings are built in `alloc`.
+	PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc)
+	{
+		if (na.attribute())
+			return xpath_string_const(na.attribute().value());
+		else
+		{
+			const xml_node& n = na.node();
+
+			switch (n.type())
+			{
+			case node_pcdata:
+			case node_cdata:
+			case node_comment:
+			case node_pi:
+				// these node types are their own string-value
+				return xpath_string_const(n.value());
+
+			case node_document:
+			case node_element:
+			{
+				xpath_string result;
+
+				xml_node cur = n.first_child();
+
+				while (cur && cur != n)
+				{
+					if (cur.type() == node_pcdata || cur.type() == node_cdata)
+						result.append(xpath_string_const(cur.value()), alloc);
+
+					// DFS order: descend first, then next sibling, then climb
+					// back up until a sibling exists or we return to n
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						while (!cur.next_sibling() && cur != n)
+							cur = cur.parent();
+
+						if (cur != n) cur = cur.next_sibling();
+					}
+				}
+
+				return result;
+			}
+
+			default:
+				return xpath_string();
+			}
+		}
+	}
+
+ PUGI__FN unsigned int node_height(xml_node n)
+ {
+ unsigned int result = 0;
+
+ while (n)
+ {
+ ++result;
+ n = n.parent();
+ }
+
+ return result;
+ }
+
+	// Returns true if ln precedes rn in document order. lh/rh are the
+	// precomputed tree heights of ln/rn (see node_height); both nodes are
+	// first lifted to equal depth, then walked up to a shared parent, and the
+	// final order is decided among siblings.
+	PUGI__FN bool node_is_before(xml_node ln, unsigned int lh, xml_node rn, unsigned int rh)
+	{
+		// normalize heights: raise the deeper node to the shallower one's level
+		for (unsigned int i = rh; i < lh; i++) ln = ln.parent();
+		for (unsigned int j = lh; j < rh; j++) rn = rn.parent();
+
+		// one node is the ancestor of the other; ancestors come first
+		if (ln == rn) return lh < rh;
+
+		// find common ancestor by climbing both sides in lockstep
+		while (ln.parent() != rn.parent())
+		{
+			ln = ln.parent();
+			rn = rn.parent();
+		}
+
+		// there is no common ancestor (the shared parent is null), nodes are from different documents
+		if (!ln.parent()) return ln < rn;
+
+		// determine sibling order: scan forward from ln looking for rn
+		for (; ln; ln = ln.next_sibling())
+			if (ln == rn)
+				return true;
+
+		return false;
+	}
+
+ PUGI__FN bool node_is_ancestor(xml_node parent, xml_node node)
+ {
+ while (node && node != parent) node = node.parent();
+
+ return parent && node == parent;
+ }
+
+ PUGI__FN const void* document_order(const xpath_node& xnode)
+ {
+ xml_node_struct* node = xnode.node().internal_object();
+
+ if (node)
+ {
+ if (node->name && (node->header & xml_memory_page_name_allocated_mask) == 0) return node->name;
+ if (node->value && (node->header & xml_memory_page_value_allocated_mask) == 0) return node->value;
+ return 0;
+ }
+
+ xml_attribute_struct* attr = xnode.attribute().internal_object();
+
+ if (attr)
+ {
+ if ((attr->header & xml_memory_page_name_allocated_mask) == 0) return attr->name;
+ if ((attr->header & xml_memory_page_value_allocated_mask) == 0) return attr->value;
+ return 0;
+ }
+
+ return 0;
+ }
+
	// Strict-weak-ordering functor that orders xpath_nodes by document order.
	// Attributes sort after their parent element; attributes of the same
	// element sort in attribute-list order.
	struct document_order_comparator
	{
		bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
		{
			// optimized document order based check: compare buffer addresses when both are available
			const void* lo = document_order(lhs);
			const void* ro = document_order(rhs);

			if (lo && ro) return lo < ro;

			// slow comparison
			xml_node ln = lhs.node(), rn = rhs.node();

			// compare attributes
			if (lhs.attribute() && rhs.attribute())
			{
				// shared parent
				if (lhs.parent() == rhs.parent())
				{
					// determine sibling order: lhs precedes rhs iff rhs is reachable via next_attribute
					for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute())
						if (a == rhs.attribute())
							return true;

					return false;
				}

				// compare attribute parents
				ln = lhs.parent();
				rn = rhs.parent();
			}
			else if (lhs.attribute())
			{
				// attributes go after the parent element
				if (lhs.parent() == rhs.node()) return false;

				ln = lhs.parent();
			}
			else if (rhs.attribute())
			{
				// attributes go after the parent element
				if (rhs.parent() == lhs.node()) return true;

				rn = rhs.parent();
			}

			if (ln == rn) return false;

			unsigned int lh = node_height(ln);
			unsigned int rh = node_height(rn);

			return node_is_before(ln, lh, rn, rh);
		}
	};
+
+ struct duplicate_comparator
+ {
+ bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+ {
+ if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true;
+ else return rhs.attribute() ? false : lhs.node() < rhs.node();
+ }
+ };
+
	// Produces a quiet NaN. When the target provably uses IEEE-754 single
	// precision, the bit pattern 0x7fc00000 is materialized through a union;
	// otherwise 0/0 is computed at runtime (volatile defeats constant folding
	// that could turn the division into a compile-time error or a trap).
	PUGI__FN double gen_nan()
	{
	#if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24))
		// array size collapses to -1 (compile error) if float and uint32_t differ in size
		union { float f; uint32_t i; } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1];
		u[0].i = 0x7fc00000;
		return u[0].f;
	#else
		// fallback
		const volatile double zero = 0.0;
		return zero / zero;
	#endif
	}
+
	// Portable NaN test: uses _isnan on MSVC/Borland CRTs, the C99 fpclassify
	// macro when available, and the self-inequality property (NaN != NaN)
	// otherwise (volatile prevents the compiler from folding v != v to false).
	PUGI__FN bool is_nan(double value)
	{
	#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
		return !!_isnan(value);
	#elif defined(fpclassify) && defined(FP_NAN)
		return fpclassify(value) == FP_NAN;
	#else
		// fallback
		const volatile double v = value;
		return v != v;
	#endif
	}
+
	// Returns the fixed XPath string for special values ("NaN", "Infinity",
	// "-Infinity", "0"), or 0 when the value needs full decimal formatting.
	PUGI__FN const char_t* convert_number_to_string_special(double value)
	{
	#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
		if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0;
		if (_isnan(value)) return PUGIXML_TEXT("NaN");
		return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
	#elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO)
		switch (fpclassify(value))
		{
		case FP_NAN:
			return PUGIXML_TEXT("NaN");

		case FP_INFINITE:
			return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");

		case FP_ZERO:
			return PUGIXML_TEXT("0");

		default:
			return 0;
		}
	#else
		// fallback: detect specials arithmetically (volatile blocks folding);
		// infinity is the only value for which v * 2 == v holds (besides 0, checked first)
		const volatile double v = value;

		if (v == 0) return PUGIXML_TEXT("0");
		if (v != v) return PUGIXML_TEXT("NaN");
		if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
		return 0;
	#endif
	}
+
+ PUGI__FN bool convert_number_to_boolean(double value)
+ {
+ return (value != 0 && !is_nan(value));
+ }
+
+ PUGI__FN void truncate_zeros(char* begin, char* end)
+ {
+ while (begin != end && end[-1] == '0') end--;
+
+ *end = 0;
+ }
+
	// Splits value into decimal mantissa digits (in the form 0.xxxxx with the
	// leading "0." implied) and a base-10 exponent, using _ecvt_s on modern
	// MSVC CRTs and a printf round-trip elsewhere.
#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
	PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
	{
		// get base values
		int sign, exponent;
		_ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign);

		// truncate redundant zeros
		truncate_zeros(buffer, buffer + strlen(buffer));

		// fill results
		*out_mantissa = buffer;
		*out_exponent = exponent;
	}
#else
	PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
	{
		// get a scientific notation value with IEEE DBL_DIG decimals
		// NOTE(review): sprintf is unbounded; callers must pass a buffer large
		// enough for "%.*e" output (the assert only checks after the fact)
		sprintf(buffer, "%.*e", DBL_DIG, value);
		assert(strlen(buffer) < buffer_size);
		(void)!buffer_size;

		// get the exponent (possibly negative)
		char* exponent_string = strchr(buffer, 'e');
		assert(exponent_string);

		int exponent = atoi(exponent_string + 1);

		// extract mantissa string: skip sign
		char* mantissa = buffer[0] == '-' ? buffer + 1 : buffer;
		assert(mantissa[0] != '0' && mantissa[1] == '.');

		// divide mantissa by 10 to eliminate integer part: "d.ddd" becomes ".dddd"
		mantissa[1] = mantissa[0];
		mantissa++;
		exponent++;

		// remove extra mantissa digits and zero-terminate mantissa
		truncate_zeros(mantissa, exponent_string);

		// fill results
		*out_mantissa = mantissa;
		*out_exponent = exponent;
	}
#endif
+
	// Formats value per the XPath number->string rules: specials as fixed
	// strings, otherwise plain decimal notation (no exponent) built from the
	// mantissa/exponent decomposition.
	PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc)
	{
		// try special number conversion
		const char_t* special = convert_number_to_string_special(value);
		if (special) return xpath_string_const(special);

		// get mantissa + exponent form
		char mantissa_buffer[64];

		char* mantissa;
		int exponent;
		convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent);

		// make the number!
		char_t result[512];
		char_t* s = result;

		// sign
		if (value < 0) *s++ = '-';

		// integer part: exponent counts digits before the decimal point
		if (exponent <= 0)
		{
			*s++ = '0';
		}
		else
		{
			while (exponent > 0)
			{
				// pad with '0' once the mantissa digits run out
				assert(*mantissa == 0 || static_cast<unsigned int>(*mantissa - '0') <= 9);
				*s++ = *mantissa ? *mantissa++ : '0';
				exponent--;
			}
		}

		// fractional part
		if (*mantissa)
		{
			// decimal point
			*s++ = '.';

			// extra zeroes from negative exponent
			while (exponent < 0)
			{
				*s++ = '0';
				exponent++;
			}

			// extra mantissa digits
			while (*mantissa)
			{
				assert(static_cast<unsigned int>(*mantissa - '0') <= 9);
				*s++ = *mantissa++;
			}
		}

		// zero-terminate
		assert(s < result + sizeof(result) / sizeof(result[0]));
		*s = 0;

		return xpath_string(result, alloc);
	}
+
+ PUGI__FN bool check_string_to_number_format(const char_t* string)
+ {
+ // parse leading whitespace
+ while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+ // parse sign
+ if (*string == '-') ++string;
+
+ if (!*string) return false;
+
+ // if there is no integer part, there should be a decimal part with at least one digit
+ if (!PUGI__IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !PUGI__IS_CHARTYPEX(string[1], ctx_digit))) return false;
+
+ // parse integer part
+ while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+
+ // parse decimal part
+ if (*string == '.')
+ {
+ ++string;
+
+ while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+ }
+
+ // parse trailing whitespace
+ while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+ return *string == 0;
+ }
+
+ PUGI__FN double convert_string_to_number(const char_t* string)
+ {
+ // check string format
+ if (!check_string_to_number_format(string)) return gen_nan();
+
+ // parse string
+ #ifdef PUGIXML_WCHAR_MODE
+ return wcstod(string, 0);
+ #else
+ return atof(string);
+ #endif
+ }
+
+ PUGI__FN bool convert_string_to_number(const char_t* begin, const char_t* end, double* out_result)
+ {
+ char_t buffer[32];
+
+ size_t length = static_cast<size_t>(end - begin);
+ char_t* scratch = buffer;
+
+ if (length >= sizeof(buffer) / sizeof(buffer[0]))
+ {
+ // need to make dummy on-heap copy
+ scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+ if (!scratch) return false;
+ }
+
+ // copy string to zero-terminated buffer and perform conversion
+ memcpy(scratch, begin, length * sizeof(char_t));
+ scratch[length] = 0;
+
+ *out_result = convert_string_to_number(scratch);
+
+ // free dummy buffer
+ if (scratch != buffer) xml_memory::deallocate(scratch);
+
+ return true;
+ }
+
+ PUGI__FN double round_nearest(double value)
+ {
+ return floor(value + 0.5);
+ }
+
+ PUGI__FN double round_nearest_nzero(double value)
+ {
+ // same as round_nearest, but returns -0 for [-0.5, -0]
+ // ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0)
+ return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5);
+ }
+
+ PUGI__FN const char_t* qualified_name(const xpath_node& node)
+ {
+ return node.attribute() ? node.attribute().name() : node.node().name();
+ }
+
+ PUGI__FN const char_t* local_name(const xpath_node& node)
+ {
+ const char_t* name = qualified_name(node);
+ const char_t* p = find_char(name, ':');
+
+ return p ? p + 1 : name;
+ }
+
	// Attribute predicate matching the xmlns declaration that governs a given
	// qualified name: "xmlns:prefix" when the name has a prefix, bare "xmlns"
	// (the default namespace declaration) otherwise.
	struct namespace_uri_predicate
	{
		const char_t* prefix;       // points at the prefix inside the original name, or 0 if none
		size_t prefix_length;

		namespace_uri_predicate(const char_t* name)
		{
			const char_t* pos = find_char(name, ':');

			prefix = pos ? name : 0;
			prefix_length = pos ? static_cast<size_t>(pos - name) : 0;
		}

		bool operator()(const xml_attribute& a) const
		{
			const char_t* name = a.name();

			if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false;

			// "xmlns:<prefix>" must match our prefix exactly; "xmlns" matches only when we have no prefix
			return prefix ? name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0;
		}
	};
+
+ PUGI__FN const char_t* namespace_uri(const xml_node& node)
+ {
+ namespace_uri_predicate pred = node.name();
+
+ xml_node p = node;
+
+ while (p)
+ {
+ xml_attribute a = p.find_attribute(pred);
+
+ if (a) return a.value();
+
+ p = p.parent();
+ }
+
+ return PUGIXML_TEXT("");
+ }
+
+ PUGI__FN const char_t* namespace_uri(const xml_attribute& attr, const xml_node& parent)
+ {
+ namespace_uri_predicate pred = attr.name();
+
+ // Default namespace does not apply to attributes
+ if (!pred.prefix) return PUGIXML_TEXT("");
+
+ xml_node p = parent;
+
+ while (p)
+ {
+ xml_attribute a = p.find_attribute(pred);
+
+ if (a) return a.value();
+
+ p = p.parent();
+ }
+
+ return PUGIXML_TEXT("");
+ }
+
+ PUGI__FN const char_t* namespace_uri(const xpath_node& node)
+ {
+ return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node());
+ }
+
	// In-place XPath normalize-space(): collapses whitespace runs to a single
	// space and strips leading/trailing whitespace. Safe in place because the
	// write cursor never passes the read cursor.
	PUGI__FN void normalize_space(char_t* buffer)
	{
		char_t* write = buffer;

		for (char_t* it = buffer; *it; )
		{
			char_t ch = *it++;

			if (PUGI__IS_CHARTYPE(ch, ct_space))
			{
				// replace whitespace sequence with single space
				while (PUGI__IS_CHARTYPE(*it, ct_space)) it++;

				// avoid leading spaces
				if (write != buffer) *write++ = ' ';
			}
			else *write++ = ch;
		}

		// remove trailing space (at most one can remain after collapsing)
		if (write != buffer && PUGI__IS_CHARTYPE(write[-1], ct_space)) write--;

		// zero-terminate
		*write = 0;
	}
+
	// In-place XPath translate(): each character found in 'from' is replaced
	// by the character at the same index in 'to', or removed when 'to' is
	// shorter; other characters pass through unchanged.
	PUGI__FN void translate(char_t* buffer, const char_t* from, const char_t* to)
	{
		size_t to_length = strlength(to);

		char_t* write = buffer;

		while (*buffer)
		{
			// PUGI__DMC_VOLATILE is a compiler workaround (presumably for
			// Digital Mars codegen) - keep as-is
			PUGI__DMC_VOLATILE char_t ch = *buffer++;

			const char_t* pos = find_char(from, ch);

			if (!pos)
				*write++ = ch; // do not process
			else if (static_cast<size_t>(pos - from) < to_length)
				*write++ = to[pos - from]; // replace; characters past to_length are dropped
		}

		// zero-terminate
		*write = 0;
	}
+
	// Boolean-typed XPath variable. 'name' is a 1-element array extended at
	// allocation time by new_xpath_variable, which over-allocates the struct
	// so the name is stored inline.
	struct xpath_variable_boolean: xpath_variable
	{
		xpath_variable_boolean(): value(false)
		{
		}

		bool value;
		char_t name[1];
	};
+
	// Number-typed XPath variable; see xpath_variable_boolean for the inline
	// 'name' storage scheme.
	struct xpath_variable_number: xpath_variable
	{
		xpath_variable_number(): value(0)
		{
		}

		double value;
		char_t name[1];
	};
+
	// String-typed XPath variable; owns its heap-allocated value and releases
	// it through xml_memory on destruction.
	struct xpath_variable_string: xpath_variable
	{
		xpath_variable_string(): value(0)
		{
		}

		~xpath_variable_string()
		{
			if (value) xml_memory::deallocate(value);
		}

		char_t* value;
		char_t name[1];
	};
+
	// Node-set-typed XPath variable; the set is held by value.
	struct xpath_variable_node_set: xpath_variable
	{
		xpath_node_set value;
		char_t name[1];
	};
+
	// shared empty node set used as a safe default
	static const xpath_node_set dummy_node_set;
+
+ PUGI__FN unsigned int hash_string(const char_t* str)
+ {
+ // Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time)
+ unsigned int result = 0;
+
+ while (*str)
+ {
+ result += static_cast<unsigned int>(*str++);
+ result += result << 10;
+ result ^= result >> 6;
+ }
+
+ result += result << 3;
+ result ^= result >> 11;
+ result += result << 15;
+
+ return result;
+ }
+
	// Allocates a typed variable with the name stored inline after the struct.
	// Returns 0 for empty names or on allocation failure.
	template <typename T> PUGI__FN T* new_xpath_variable(const char_t* name)
	{
		size_t length = strlength(name);
		if (length == 0) return 0; // empty variable names are invalid

		// $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters
		// (T already contains name[1], which accounts for the terminator)
		void* memory = xml_memory::allocate(sizeof(T) + length * sizeof(char_t));
		if (!memory) return 0;

		// placement-new constructs T in the over-allocated block
		T* result = new (memory) T();

		memcpy(result->name, name, (length + 1) * sizeof(char_t));

		return result;
	}
+
+ PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name)
+ {
+ switch (type)
+ {
+ case xpath_type_node_set:
+ return new_xpath_variable<xpath_variable_node_set>(name);
+
+ case xpath_type_number:
+ return new_xpath_variable<xpath_variable_number>(name);
+
+ case xpath_type_string:
+ return new_xpath_variable<xpath_variable_string>(name);
+
+ case xpath_type_boolean:
+ return new_xpath_variable<xpath_variable_boolean>(name);
+
+ default:
+ return 0;
+ }
+ }
+
	// Destroys and frees a variable created by new_xpath_variable<T>;
	// explicit destructor call mirrors the placement new used there.
	template <typename T> PUGI__FN void delete_xpath_variable(T* var)
	{
		var->~T();
		xml_memory::deallocate(var);
	}
+
	// Type-dispatching destruction: casts to the concrete variable type so
	// the right destructor runs (xpath_variable itself is not polymorphic here).
	PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var)
	{
		switch (type)
		{
		case xpath_type_node_set:
			delete_xpath_variable(static_cast<xpath_variable_node_set*>(var));
			break;

		case xpath_type_number:
			delete_xpath_variable(static_cast<xpath_variable_number*>(var));
			break;

		case xpath_type_string:
			delete_xpath_variable(static_cast<xpath_variable_string*>(var));
			break;

		case xpath_type_boolean:
			delete_xpath_variable(static_cast<xpath_variable_boolean*>(var));
			break;

		default:
			assert(!"Invalid variable type");
		}
	}
+
+ PUGI__FN xpath_variable* get_variable(xpath_variable_set* set, const char_t* begin, const char_t* end)
+ {
+ char_t buffer[32];
+
+ size_t length = static_cast<size_t>(end - begin);
+ char_t* scratch = buffer;
+
+ if (length >= sizeof(buffer) / sizeof(buffer[0]))
+ {
+ // need to make dummy on-heap copy
+ scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+ if (!scratch) return 0;
+ }
+
+ // copy string to zero-terminated buffer and perform lookup
+ memcpy(scratch, begin, length * sizeof(char_t));
+ scratch[length] = 0;
+
+ xpath_variable* result = set->get(scratch);
+
+ // free dummy buffer
+ if (scratch != buffer) xml_memory::deallocate(scratch);
+
+ return result;
+ }
+PUGI__NS_END
+
+// Internal node set class
+PUGI__NS_BEGIN
	// Brings [begin, end) into the requested order (document order, or
	// reverse document order when rev is set) and returns the resulting
	// order tag. Already-sorted input is only reversed when needed.
	PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev)
	{
		xpath_node_set::type_t order = rev ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;

		if (type == xpath_node_set::type_unsorted)
		{
			sort(begin, end, document_order_comparator());

			type = xpath_node_set::type_sorted;
		}

		// flip direction if the achieved order is not the requested one
		if (type != order) reverse(begin, end);

		return order;
	}
+
+ PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type)
+ {
+ if (begin == end) return xpath_node();
+
+ switch (type)
+ {
+ case xpath_node_set::type_sorted:
+ return *begin;
+
+ case xpath_node_set::type_sorted_reverse:
+ return *(end - 1);
+
+ case xpath_node_set::type_unsorted:
+ return *min_element(begin, end, document_order_comparator());
+
+ default:
+ assert(!"Invalid node set type");
+ return xpath_node();
+ }
+ }
+
	// Lightweight growable array of xpath_node used during evaluation.
	// Storage comes from an xpath_allocator (passed into each mutating call),
	// so the container itself owns nothing and has no destructor. Bulk moves
	// use memcpy, which relies on xpath_node being memcpy-safe.
	class xpath_node_set_raw
	{
		xpath_node_set::type_t _type;  // current ordering of the contents

		xpath_node* _begin;
		xpath_node* _end;   // one past the last used element
		xpath_node* _eos;   // one past the end of allocated storage

	public:
		xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0)
		{
		}

		xpath_node* begin() const
		{
			return _begin;
		}

		xpath_node* end() const
		{
			return _end;
		}

		bool empty() const
		{
			return _begin == _end;
		}

		size_t size() const
		{
			return static_cast<size_t>(_end - _begin);
		}

		// first node in document order, per the current ordering tag
		xpath_node first() const
		{
			return xpath_first(_begin, _end, _type);
		}

		void push_back(const xpath_node& node, xpath_allocator* alloc)
		{
			if (_end == _eos)
			{
				size_t capacity = static_cast<size_t>(_eos - _begin);

				// get new capacity (1.5x rule)
				size_t new_capacity = capacity + capacity / 2 + 1;

				// reallocate the old array or allocate a new one
				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node)));
				assert(data);

				// finalize
				_begin = data;
				_end = data + capacity;
				_eos = data + new_capacity;
			}

			*_end++ = node;
		}

		// appends a range; grows exactly to fit (no 1.5x headroom here)
		void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc)
		{
			size_t size_ = static_cast<size_t>(_end - _begin);
			size_t capacity = static_cast<size_t>(_eos - _begin);
			size_t count = static_cast<size_t>(end_ - begin_);

			if (size_ + count > capacity)
			{
				// reallocate the old array or allocate a new one
				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node)));
				assert(data);

				// finalize
				_begin = data;
				_end = data + size_;
				_eos = data + size_ + count;
			}

			memcpy(_end, begin_, count * sizeof(xpath_node));
			_end += count;
		}

		void sort_do()
		{
			_type = xpath_sort(_begin, _end, _type, false);
		}

		void truncate(xpath_node* pos)
		{
			assert(_begin <= pos && pos <= _end);

			_end = pos;
		}

		// removes duplicates; unsorted contents are sorted by handle identity
		// first so equal entries become adjacent for unique()
		void remove_duplicates()
		{
			if (_type == xpath_node_set::type_unsorted)
				sort(_begin, _end, duplicate_comparator());

			_end = unique(_begin, _end);
		}

		xpath_node_set::type_t type() const
		{
			return _type;
		}

		void set_type(xpath_node_set::type_t value)
		{
			_type = value;
		}
	};
+PUGI__NS_END
+
+PUGI__NS_BEGIN
	// XPath evaluation context: the context node plus its 1-based proximity
	// position and the context size (as used by position() and last()).
	struct xpath_context
	{
		xpath_node n;
		size_t position, size;

		xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_)
		{
		}
	};
+
	// Token kinds produced by xpath_lexer; lex_none marks an invalid or
	// unterminated token.
	enum lexeme_t
	{
		lex_none = 0,
		lex_equal,
		lex_not_equal,
		lex_less,
		lex_greater,
		lex_less_or_equal,
		lex_greater_or_equal,
		lex_plus,
		lex_minus,
		lex_multiply,
		lex_union,
		lex_var_ref,
		lex_open_brace,
		lex_close_brace,
		lex_quoted_string,
		lex_number,
		lex_slash,
		lex_double_slash,
		lex_open_square_brace,
		lex_close_square_brace,
		lex_string,
		lex_comma,
		lex_axis_attribute,
		lex_dot,
		lex_double_dot,
		lex_double_colon,
		lex_eof
	};
+
	// Non-owning, non-terminated [begin, end) view into the query string,
	// used for lexeme contents.
	struct xpath_lexer_string
	{
		const char_t* begin;
		const char_t* end;

		xpath_lexer_string(): begin(0), end(0)
		{
		}

		// compares against a zero-terminated string
		bool operator==(const char_t* other) const
		{
			size_t length = static_cast<size_t>(end - begin);

			return strequalrange(other, begin, length);
		}
	};
+
	// Hand-written single-token-lookahead lexer for XPath expressions.
	// next() advances to the following token; current()/contents() inspect
	// the token last produced. Malformed input yields lex_none rather than
	// an error code; the parser reports the failure using current_pos().
	class xpath_lexer
	{
		const char_t* _cur;                       // scan position (start of the *next* token)
		const char_t* _cur_lexeme_pos;            // start of the current token, for diagnostics
		xpath_lexer_string _cur_lexeme_contents;  // text of the current token, where applicable

		lexeme_t _cur_lexeme;

	public:
		// primes the lexer so current() is valid immediately
		explicit xpath_lexer(const char_t* query): _cur(query)
		{
			next();
		}

		const char_t* state() const
		{
			return _cur;
		}

		void next()
		{
			const char_t* cur = _cur;

			while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur;

			// save lexeme position for error reporting
			_cur_lexeme_pos = cur;

			switch (*cur)
			{
			case 0:
				_cur_lexeme = lex_eof;
				break;

			case '>':
				if (*(cur+1) == '=')
				{
					cur += 2;
					_cur_lexeme = lex_greater_or_equal;
				}
				else
				{
					cur += 1;
					_cur_lexeme = lex_greater;
				}
				break;

			case '<':
				if (*(cur+1) == '=')
				{
					cur += 2;
					_cur_lexeme = lex_less_or_equal;
				}
				else
				{
					cur += 1;
					_cur_lexeme = lex_less;
				}
				break;

			case '!':
				// '!' is only valid as part of '!='
				if (*(cur+1) == '=')
				{
					cur += 2;
					_cur_lexeme = lex_not_equal;
				}
				else
				{
					_cur_lexeme = lex_none;
				}
				break;

			case '=':
				cur += 1;
				_cur_lexeme = lex_equal;

				break;

			case '+':
				cur += 1;
				_cur_lexeme = lex_plus;

				break;

			case '-':
				cur += 1;
				_cur_lexeme = lex_minus;

				break;

			case '*':
				cur += 1;
				_cur_lexeme = lex_multiply;

				break;

			case '|':
				cur += 1;
				_cur_lexeme = lex_union;

				break;

			case '$':
				// variable reference: $name or $prefix:name
				cur += 1;

				if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
				{
					_cur_lexeme_contents.begin = cur;

					while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;

					if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // qname
					{
						cur++; // :

						while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
					}

					_cur_lexeme_contents.end = cur;

					_cur_lexeme = lex_var_ref;
				}
				else
				{
					_cur_lexeme = lex_none;
				}

				break;

			case '(':
				cur += 1;
				_cur_lexeme = lex_open_brace;

				break;

			case ')':
				cur += 1;
				_cur_lexeme = lex_close_brace;

				break;

			case '[':
				cur += 1;
				_cur_lexeme = lex_open_square_brace;

				break;

			case ']':
				cur += 1;
				_cur_lexeme = lex_close_square_brace;

				break;

			case ',':
				cur += 1;
				_cur_lexeme = lex_comma;

				break;

			case '/':
				if (*(cur+1) == '/')
				{
					cur += 2;
					_cur_lexeme = lex_double_slash;
				}
				else
				{
					cur += 1;
					_cur_lexeme = lex_slash;
				}
				break;

			case '.':
				// '..', a number like '.5', or the '.' step
				if (*(cur+1) == '.')
				{
					cur += 2;
					_cur_lexeme = lex_double_dot;
				}
				else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit))
				{
					_cur_lexeme_contents.begin = cur; // .

					++cur;

					while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;

					_cur_lexeme_contents.end = cur;

					_cur_lexeme = lex_number;
				}
				else
				{
					cur += 1;
					_cur_lexeme = lex_dot;
				}
				break;

			case '@':
				cur += 1;
				_cur_lexeme = lex_axis_attribute;

				break;

			case '"':
			case '\'':
			{
				// quoted literal; an unterminated literal is lex_none
				char_t terminator = *cur;

				++cur;

				_cur_lexeme_contents.begin = cur;
				while (*cur && *cur != terminator) cur++;
				_cur_lexeme_contents.end = cur;

				if (!*cur)
					_cur_lexeme = lex_none;
				else
				{
					cur += 1;
					_cur_lexeme = lex_quoted_string;
				}

				break;
			}

			case ':':
				// ':' is only valid as part of '::'
				if (*(cur+1) == ':')
				{
					cur += 2;
					_cur_lexeme = lex_double_colon;
				}
				else
				{
					_cur_lexeme = lex_none;
				}
				break;

			default:
				if (PUGI__IS_CHARTYPEX(*cur, ctx_digit))
				{
					// number: digits with an optional fractional part
					_cur_lexeme_contents.begin = cur;

					while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;

					if (*cur == '.')
					{
						cur++;

						while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
					}

					_cur_lexeme_contents.end = cur;

					_cur_lexeme = lex_number;
				}
				else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
				{
					// name: ncname, ncname:*, or qname
					_cur_lexeme_contents.begin = cur;

					while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;

					if (cur[0] == ':')
					{
						if (cur[1] == '*') // namespace test ncname:*
						{
							cur += 2; // :*
						}
						else if (PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // namespace test qname
						{
							cur++; // :

							while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
						}
					}

					_cur_lexeme_contents.end = cur;

					_cur_lexeme = lex_string;
				}
				else
				{
					_cur_lexeme = lex_none;
				}
			}

			_cur = cur;
		}

		lexeme_t current() const
		{
			return _cur_lexeme;
		}

		const char_t* current_pos() const
		{
			return _cur_lexeme_pos;
		}

		// contents are only defined for token kinds that carry text
		const xpath_lexer_string& contents() const
		{
			assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string);

			return _cur_lexeme_contents;
		}
	};
+
	// AST node kinds for parsed XPath expressions; stored narrowed in
	// xpath_ast_node::_type.
	enum ast_type_t
	{
		ast_op_or,						// left or right
		ast_op_and,						// left and right
		ast_op_equal,					// left = right
		ast_op_not_equal,				// left != right
		ast_op_less,					// left < right
		ast_op_greater,					// left > right
		ast_op_less_or_equal,			// left <= right
		ast_op_greater_or_equal,		// left >= right
		ast_op_add,						// left + right
		ast_op_subtract,				// left - right
		ast_op_multiply,				// left * right
		ast_op_divide,					// left / right
		ast_op_mod,						// left % right
		ast_op_negate,					// left - right
		ast_op_union,					// left | right
		ast_predicate,					// apply predicate to set; next points to next predicate
		ast_filter,						// select * from left where right
		ast_filter_posinv,				// select * from left where right; proximity position invariant
		ast_string_constant,			// string constant
		ast_number_constant,			// number constant
		ast_variable,					// variable
		ast_func_last,					// last()
		ast_func_position,				// position()
		ast_func_count,					// count(left)
		ast_func_id,					// id(left)
		ast_func_local_name_0,			// local-name()
		ast_func_local_name_1,			// local-name(left)
		ast_func_namespace_uri_0,		// namespace-uri()
		ast_func_namespace_uri_1,		// namespace-uri(left)
		ast_func_name_0,				// name()
		ast_func_name_1,				// name(left)
		ast_func_string_0,				// string()
		ast_func_string_1,				// string(left)
		ast_func_concat,				// concat(left, right, siblings)
		ast_func_starts_with,			// starts_with(left, right)
		ast_func_contains,				// contains(left, right)
		ast_func_substring_before,		// substring-before(left, right)
		ast_func_substring_after,		// substring-after(left, right)
		ast_func_substring_2,			// substring(left, right)
		ast_func_substring_3,			// substring(left, right, third)
		ast_func_string_length_0,		// string-length()
		ast_func_string_length_1,		// string-length(left)
		ast_func_normalize_space_0,		// normalize-space()
		ast_func_normalize_space_1,		// normalize-space(left)
		ast_func_translate,				// translate(left, right, third)
		ast_func_boolean,				// boolean(left)
		ast_func_not,					// not(left)
		ast_func_true,					// true()
		ast_func_false,					// false()
		ast_func_lang,					// lang(left)
		ast_func_number_0,				// number()
		ast_func_number_1,				// number(left)
		ast_func_sum,					// sum(left)
		ast_func_floor,					// floor(left)
		ast_func_ceiling,				// ceiling(left)
		ast_func_round,					// round(left)
		ast_step,						// process set left with step
		ast_step_root					// select root node
	};
+
	// The thirteen XPath axes; stored narrowed in xpath_ast_node::_axis.
	enum axis_t
	{
		axis_ancestor,
		axis_ancestor_or_self,
		axis_attribute,
		axis_child,
		axis_descendant,
		axis_descendant_or_self,
		axis_following,
		axis_following_sibling,
		axis_namespace,
		axis_parent,
		axis_preceding,
		axis_preceding_sibling,
		axis_self
	};
+
	// Node tests applied by a step; stored narrowed in xpath_ast_node::_test.
	enum nodetest_t
	{
		nodetest_none,
		nodetest_name,              // name or qualified name
		nodetest_type_node,         // node()
		nodetest_type_comment,      // comment()
		nodetest_type_pi,           // processing-instruction()
		nodetest_type_text,         // text()
		nodetest_pi,                // processing-instruction("target")
		nodetest_all,               // *
		nodetest_all_in_namespace   // prefix:*
	};
+
	// Lifts an axis value to a distinct type so step dispatch can happen via
	// template tag parameters (see step_fill's T::axis usage).
	template <axis_t N> struct axis_to_type
	{
		static const axis_t axis;
	};

	template <axis_t N> const axis_t axis_to_type<N>::axis = N;
+
+ class xpath_ast_node
+ {
	private:
		// node kind and statically-known result type; narrowed to char to
		// keep the node small (presumably ast_type_t / xpath_value_type
		// values - set by constructors outside this view)
		char _type;
		char _rettype;

		// for ast_step / ast_predicate: axis and node test, also narrowed
		char _axis;
		char _test;

		// tree node structure
		xpath_ast_node* _left;
		xpath_ast_node* _right;
		xpath_ast_node* _next;

		// per-kind payload; which member is active depends on _type
		union
		{
			// value for ast_string_constant
			const char_t* string;
			// value for ast_number_constant
			double number;
			// variable for ast_variable
			xpath_variable* variable;
			// node test for ast_step (node name/namespace/node type/pi target)
			const char_t* nodetest;
		} _data;

		// non-copyable
		xpath_ast_node(const xpath_ast_node&);
		xpath_ast_node& operator=(const xpath_ast_node&);
+
		// Implements XPath =/!= semantics. Non-node-set comparisons coerce to
		// the "strongest" common type (boolean > number > string); node-set
		// comparisons are existential: true if ANY pair of values satisfies
		// comp. Mixed node-set/scalar comparisons coerce each node's string
		// value to the scalar's type.
		template <class Comp> static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
		{
			xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();

			if (lt != xpath_type_node_set && rt != xpath_type_node_set)
			{
				if (lt == xpath_type_boolean || rt == xpath_type_boolean)
					return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
				else if (lt == xpath_type_number || rt == xpath_type_number)
					return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
				else if (lt == xpath_type_string || rt == xpath_type_string)
				{
					// capture releases temporary string storage on scope exit
					xpath_allocator_capture cr(stack.result);

					xpath_string ls = lhs->eval_string(c, stack);
					xpath_string rs = rhs->eval_string(c, stack);

					return comp(ls, rs);
				}
			}
			else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
			{
				xpath_allocator_capture cr(stack.result);

				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);

				// existential: compare every pair of string values
				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
					{
						xpath_allocator_capture cri(stack.result);

						if (comp(string_value(*li, stack.result), string_value(*ri, stack.result)))
							return true;
					}

				return false;
			}
			else
			{
				// normalize so the node-set is on the right
				if (lt == xpath_type_node_set)
				{
					swap(lhs, rhs);
					swap(lt, rt);
				}

				if (lt == xpath_type_boolean)
					return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
				else if (lt == xpath_type_number)
				{
					xpath_allocator_capture cr(stack.result);

					double l = lhs->eval_number(c, stack);
					xpath_node_set_raw rs = rhs->eval_node_set(c, stack);

					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
					{
						xpath_allocator_capture cri(stack.result);

						if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
							return true;
					}

					return false;
				}
				else if (lt == xpath_type_string)
				{
					xpath_allocator_capture cr(stack.result);

					xpath_string l = lhs->eval_string(c, stack);
					xpath_node_set_raw rs = rhs->eval_node_set(c, stack);

					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
					{
						xpath_allocator_capture cri(stack.result);

						if (comp(l, string_value(*ri, stack.result)))
							return true;
					}

					return false;
				}
			}

			assert(!"Wrong types");
			return false;
		}
+
		// Implements XPath relational semantics (< > <= >=): both operands
		// are coerced to numbers; node-sets are compared existentially over
		// the numeric value of each node's string value.
		template <class Comp> static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
		{
			xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();

			if (lt != xpath_type_node_set && rt != xpath_type_node_set)
				return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
			else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
			{
				xpath_allocator_capture cr(stack.result);

				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);

				// existential over all left/right pairs; left value hoisted per row
				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
				{
					xpath_allocator_capture cri(stack.result);

					double l = convert_string_to_number(string_value(*li, stack.result).c_str());

					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
					{
						xpath_allocator_capture crii(stack.result);

						if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
							return true;
					}
				}

				return false;
			}
			else if (lt != xpath_type_node_set && rt == xpath_type_node_set)
			{
				xpath_allocator_capture cr(stack.result);

				double l = lhs->eval_number(c, stack);
				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);

				for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
				{
					xpath_allocator_capture cri(stack.result);

					if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
						return true;
				}

				return false;
			}
			else if (lt == xpath_type_node_set && rt != xpath_type_node_set)
			{
				xpath_allocator_capture cr(stack.result);

				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
				double r = rhs->eval_number(c, stack);

				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
				{
					xpath_allocator_capture cri(stack.result);

					if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r))
						return true;
				}

				return false;
			}
			else
			{
				assert(!"Wrong types");
				return false;
			}
		}
+
		// Filters ns[first..end) in place with a single predicate expression.
		// A numeric predicate means positional selection (position() = value);
		// anything else is evaluated as a boolean. i is the 1-based proximity
		// position within the filtered range.
		void apply_predicate(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack)
		{
			assert(ns.size() >= first);

			size_t i = 1;
			size_t size = ns.size() - first;

			xpath_node* last = ns.begin() + first;

			// remove_if... or well, sort of: kept nodes are compacted toward 'last'
			for (xpath_node* it = last; it != ns.end(); ++it, ++i)
			{
				xpath_context c(*it, i, size);

				if (expr->rettype() == xpath_type_number)
				{
					if (expr->eval_number(c, stack) == i)
						*last++ = *it;
				}
				else if (expr->eval_boolean(c, stack))
					*last++ = *it;
			}

			ns.truncate(last);
		}
+
		// Applies the step's predicate chain (_right, linked via _next) to
		// ns[first..end), narrowing the range with each predicate in turn.
		void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack)
		{
			if (ns.size() == first) return;

			for (xpath_ast_node* pred = _right; pred; pred = pred->_next)
			{
				apply_predicate(ns, first, pred->_left, stack);
			}
		}
+
		// Pushes attribute a (with its parent element) into ns if it passes
		// this step's node test; namespace declarations are excluded per the
		// XPath data model.
		void step_push(xpath_node_set_raw& ns, const xml_attribute& a, const xml_node& parent, xpath_allocator* alloc)
		{
			if (!a) return;

			const char_t* name = a.name();

			// There are no attribute nodes corresponding to attributes that declare namespaces
			// That is, "xmlns:..." or "xmlns"
			if (starts_with(name, PUGIXML_TEXT("xmlns")) && (name[5] == 0 || name[5] == ':')) return;

			switch (_test)
			{
			case nodetest_name:
				if (strequal(name, _data.nodetest)) ns.push_back(xpath_node(a, parent), alloc);
				break;

			case nodetest_type_node:
			case nodetest_all:
				ns.push_back(xpath_node(a, parent), alloc);
				break;

			case nodetest_all_in_namespace:
				// _data.nodetest holds the "prefix:" string for prefix:* tests
				if (starts_with(name, _data.nodetest))
					ns.push_back(xpath_node(a, parent), alloc);
				break;

			default:
				// other node tests (comment(), text(), ...) never match attributes
				;
			}
		}
+
+ void step_push(xpath_node_set_raw& ns, const xml_node& n, xpath_allocator* alloc)
+ {
+ if (!n) return;
+
+ switch (_test)
+ {
+ case nodetest_name:
+ if (n.type() == node_element && strequal(n.name(), _data.nodetest)) ns.push_back(n, alloc);
+ break;
+
+ case nodetest_type_node:
+ ns.push_back(n, alloc);
+ break;
+
+ case nodetest_type_comment:
+ if (n.type() == node_comment)
+ ns.push_back(n, alloc);
+ break;
+
+ case nodetest_type_text:
+ if (n.type() == node_pcdata || n.type() == node_cdata)
+ ns.push_back(n, alloc);
+ break;
+
+ case nodetest_type_pi:
+ if (n.type() == node_pi)
+ ns.push_back(n, alloc);
+ break;
+
+ case nodetest_pi:
+ if (n.type() == node_pi && strequal(n.name(), _data.nodetest))
+ ns.push_back(n, alloc);
+ break;
+
+ case nodetest_all:
+ if (n.type() == node_element)
+ ns.push_back(n, alloc);
+ break;
+
+ case nodetest_all_in_namespace:
+ if (n.type() == node_element && starts_with(n.name(), _data.nodetest))
+ ns.push_back(n, alloc);
+ break;
+
+ default:
+ assert(!"Unknown axis");
+ }
+ }
+
+ template <class T> void step_fill(xpath_node_set_raw& ns, const xml_node& n, xpath_allocator* alloc, T)
+ {
+ const axis_t axis = T::axis;
+
+ switch (axis)
+ {
+ case axis_attribute:
+ {
+ for (xml_attribute a = n.first_attribute(); a; a = a.next_attribute())
+ step_push(ns, a, n, alloc);
+
+ break;
+ }
+
+ case axis_child:
+ {
+ for (xml_node c = n.first_child(); c; c = c.next_sibling())
+ step_push(ns, c, alloc);
+
+ break;
+ }
+
+ case axis_descendant:
+ case axis_descendant_or_self:
+ {
+ if (axis == axis_descendant_or_self)
+ step_push(ns, n, alloc);
+
+ xml_node cur = n.first_child();
+
+ while (cur && cur != n)
+ {
+ step_push(ns, cur, alloc);
+
+ if (cur.first_child())
+ cur = cur.first_child();
+ else if (cur.next_sibling())
+ cur = cur.next_sibling();
+ else
+ {
+ while (!cur.next_sibling() && cur != n)
+ cur = cur.parent();
+
+ if (cur != n) cur = cur.next_sibling();
+ }
+ }
+
+ break;
+ }
+
+ case axis_following_sibling:
+ {
+ for (xml_node c = n.next_sibling(); c; c = c.next_sibling())
+ step_push(ns, c, alloc);
+
+ break;
+ }
+
+ case axis_preceding_sibling:
+ {
+ for (xml_node c = n.previous_sibling(); c; c = c.previous_sibling())
+ step_push(ns, c, alloc);
+
+ break;
+ }
+
+ case axis_following:
+ {
+ xml_node cur = n;
+
+ // exit from this node so that we don't include descendants
+ while (cur && !cur.next_sibling()) cur = cur.parent();
+ cur = cur.next_sibling();
+
+ for (;;)
+ {
+ step_push(ns, cur, alloc);
+
+ if (cur.first_child())
+ cur = cur.first_child();
+ else if (cur.next_sibling())
+ cur = cur.next_sibling();
+ else
+ {
+ while (cur && !cur.next_sibling()) cur = cur.parent();
+ cur = cur.next_sibling();
+
+ if (!cur) break;
+ }
+ }
+
+ break;
+ }
+
+ case axis_preceding:
+ {
+ xml_node cur = n;
+
+ while (cur && !cur.previous_sibling()) cur = cur.parent();
+ cur = cur.previous_sibling();
+
+ for (;;)
+ {
+ if (cur.last_child())
+ cur = cur.last_child();
+ else
+ {
+ // leaf node, can't be ancestor
+ step_push(ns, cur, alloc);
+
+ if (cur.previous_sibling())
+ cur = cur.previous_sibling();
+ else
+ {
+ do
+ {
+ cur = cur.parent();
+ if (!cur) break;
+
+ if (!node_is_ancestor(cur, n)) step_push(ns, cur, alloc);
+ }
+ while (!cur.previous_sibling());
+
+ cur = cur.previous_sibling();
+
+ if (!cur) break;
+ }
+ }
+ }
+
+ break;
+ }
+
+ case axis_ancestor:
+ case axis_ancestor_or_self:
+ {
+ if (axis == axis_ancestor_or_self)
+ step_push(ns, n, alloc);
+
+ xml_node cur = n.parent();
+
+ while (cur)
+ {
+ step_push(ns, cur, alloc);
+
+ cur = cur.parent();
+ }
+
+ break;
+ }
+
+ case axis_self:
+ {
+ step_push(ns, n, alloc);
+
+ break;
+ }
+
+ case axis_parent:
+ {
+ if (n.parent()) step_push(ns, n.parent(), alloc);
+
+ break;
+ }
+
+ default:
+ assert(!"Unimplemented axis");
+ }
+ }
+
+ template <class T> void step_fill(xpath_node_set_raw& ns, const xml_attribute& a, const xml_node& p, xpath_allocator* alloc, T v)
+ {
+ const axis_t axis = T::axis;
+
+ switch (axis)
+ {
+ case axis_ancestor:
+ case axis_ancestor_or_self:
+ {
+ if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test
+ step_push(ns, a, p, alloc);
+
+ xml_node cur = p;
+
+ while (cur)
+ {
+ step_push(ns, cur, alloc);
+
+ cur = cur.parent();
+ }
+
+ break;
+ }
+
+ case axis_descendant_or_self:
+ case axis_self:
+ {
+ if (_test == nodetest_type_node) // reject attributes based on principal node type test
+ step_push(ns, a, p, alloc);
+
+ break;
+ }
+
+ case axis_following:
+ {
+ xml_node cur = p;
+
+ for (;;)
+ {
+ if (cur.first_child())
+ cur = cur.first_child();
+ else if (cur.next_sibling())
+ cur = cur.next_sibling();
+ else
+ {
+ while (cur && !cur.next_sibling()) cur = cur.parent();
+ cur = cur.next_sibling();
+
+ if (!cur) break;
+ }
+
+ step_push(ns, cur, alloc);
+ }
+
+ break;
+ }
+
+ case axis_parent:
+ {
+ step_push(ns, p, alloc);
+
+ break;
+ }
+
+ case axis_preceding:
+ {
+ // preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding
+ step_fill(ns, p, alloc, v);
+ break;
+ }
+
+ default:
+ assert(!"Unimplemented axis");
+ }
+ }
+
+ template <class T> xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, T v)
+ {
+ const axis_t axis = T::axis;
+ bool attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self);
+
+ xpath_node_set_raw ns;
+ ns.set_type((axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling) ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted);
+
+ if (_left)
+ {
+ xpath_node_set_raw s = _left->eval_node_set(c, stack);
+
+ // self axis preserves the original order
+ if (axis == axis_self) ns.set_type(s.type());
+
+ for (const xpath_node* it = s.begin(); it != s.end(); ++it)
+ {
+ size_t size = ns.size();
+
+ // in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes
+ if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted);
+
+ if (it->node())
+ step_fill(ns, it->node(), stack.result, v);
+ else if (attributes)
+ step_fill(ns, it->attribute(), it->parent(), stack.result, v);
+
+ apply_predicates(ns, size, stack);
+ }
+ }
+ else
+ {
+ if (c.n.node())
+ step_fill(ns, c.n.node(), stack.result, v);
+ else if (attributes)
+ step_fill(ns, c.n.attribute(), c.n.parent(), stack.result, v);
+
+ apply_predicates(ns, 0, stack);
+ }
+
+ // child, attribute and self axes always generate unique set of nodes
+ // for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice
+ if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted)
+ ns.remove_duplicates();
+
+ return ns;
+ }
+
+ public:
+ xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value):
+ _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+ {
+ assert(type == ast_string_constant);
+ _data.string = value;
+ }
+
+ xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value):
+ _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+ {
+ assert(type == ast_number_constant);
+ _data.number = value;
+ }
+
+ xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_variable* value):
+ _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+ {
+ assert(type == ast_variable);
+ _data.variable = value;
+ }
+
+ xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0):
+ _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(left), _right(right), _next(0)
+ {
+ }
+
+ xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents):
+ _type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(static_cast<char>(axis)), _test(static_cast<char>(test)), _left(left), _right(0), _next(0)
+ {
+ _data.nodetest = contents;
+ }
+
+ void set_next(xpath_ast_node* value)
+ {
+ _next = value;
+ }
+
+ void set_right(xpath_ast_node* value)
+ {
+ _right = value;
+ }
+
+ bool eval_boolean(const xpath_context& c, const xpath_stack& stack)
+ {
+ switch (_type)
+ {
+ case ast_op_or:
+ return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack);
+
+ case ast_op_and:
+ return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack);
+
+ case ast_op_equal:
+ return compare_eq(_left, _right, c, stack, equal_to());
+
+ case ast_op_not_equal:
+ return compare_eq(_left, _right, c, stack, not_equal_to());
+
+ case ast_op_less:
+ return compare_rel(_left, _right, c, stack, less());
+
+ case ast_op_greater:
+ return compare_rel(_right, _left, c, stack, less());
+
+ case ast_op_less_or_equal:
+ return compare_rel(_left, _right, c, stack, less_equal());
+
+ case ast_op_greater_or_equal:
+ return compare_rel(_right, _left, c, stack, less_equal());
+
+ case ast_func_starts_with:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ xpath_string lr = _left->eval_string(c, stack);
+ xpath_string rr = _right->eval_string(c, stack);
+
+ return starts_with(lr.c_str(), rr.c_str());
+ }
+
+ case ast_func_contains:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ xpath_string lr = _left->eval_string(c, stack);
+ xpath_string rr = _right->eval_string(c, stack);
+
+ return find_substring(lr.c_str(), rr.c_str()) != 0;
+ }
+
+ case ast_func_boolean:
+ return _left->eval_boolean(c, stack);
+
+ case ast_func_not:
+ return !_left->eval_boolean(c, stack);
+
+ case ast_func_true:
+ return true;
+
+ case ast_func_false:
+ return false;
+
+ case ast_func_lang:
+ {
+ if (c.n.attribute()) return false;
+
+ xpath_allocator_capture cr(stack.result);
+
+ xpath_string lang = _left->eval_string(c, stack);
+
+ for (xml_node n = c.n.node(); n; n = n.parent())
+ {
+ xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang"));
+
+ if (a)
+ {
+ const char_t* value = a.value();
+
+ // strnicmp / strncasecmp is not portable
+ for (const char_t* lit = lang.c_str(); *lit; ++lit)
+ {
+ if (tolower_ascii(*lit) != tolower_ascii(*value)) return false;
+ ++value;
+ }
+
+ return *value == 0 || *value == '-';
+ }
+ }
+
+ return false;
+ }
+
+ case ast_variable:
+ {
+ assert(_rettype == _data.variable->type());
+
+ if (_rettype == xpath_type_boolean)
+ return _data.variable->get_boolean();
+
+ // fallthrough to type conversion
+ }
+
+ default:
+ {
+ switch (_rettype)
+ {
+ case xpath_type_number:
+ return convert_number_to_boolean(eval_number(c, stack));
+
+ case xpath_type_string:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ return !eval_string(c, stack).empty();
+ }
+
+ case xpath_type_node_set:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ return !eval_node_set(c, stack).empty();
+ }
+
+ default:
+ assert(!"Wrong expression for return type boolean");
+ return false;
+ }
+ }
+ }
+ }
+
+ double eval_number(const xpath_context& c, const xpath_stack& stack)
+ {
+ switch (_type)
+ {
+ case ast_op_add:
+ return _left->eval_number(c, stack) + _right->eval_number(c, stack);
+
+ case ast_op_subtract:
+ return _left->eval_number(c, stack) - _right->eval_number(c, stack);
+
+ case ast_op_multiply:
+ return _left->eval_number(c, stack) * _right->eval_number(c, stack);
+
+ case ast_op_divide:
+ return _left->eval_number(c, stack) / _right->eval_number(c, stack);
+
+ case ast_op_mod:
+ return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack));
+
+ case ast_op_negate:
+ return -_left->eval_number(c, stack);
+
+ case ast_number_constant:
+ return _data.number;
+
+ case ast_func_last:
+ return static_cast<double>(c.size);
+
+ case ast_func_position:
+ return static_cast<double>(c.position);
+
+ case ast_func_count:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ return static_cast<double>(_left->eval_node_set(c, stack).size());
+ }
+
+ case ast_func_string_length_0:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ return static_cast<double>(string_value(c.n, stack.result).length());
+ }
+
+ case ast_func_string_length_1:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ return static_cast<double>(_left->eval_string(c, stack).length());
+ }
+
+ case ast_func_number_0:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ return convert_string_to_number(string_value(c.n, stack.result).c_str());
+ }
+
+ case ast_func_number_1:
+ return _left->eval_number(c, stack);
+
+ case ast_func_sum:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ double r = 0;
+
+ xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+
+ for (const xpath_node* it = ns.begin(); it != ns.end(); ++it)
+ {
+ xpath_allocator_capture cri(stack.result);
+
+ r += convert_string_to_number(string_value(*it, stack.result).c_str());
+ }
+
+ return r;
+ }
+
+ case ast_func_floor:
+ {
+ double r = _left->eval_number(c, stack);
+
+ return r == r ? floor(r) : r;
+ }
+
+ case ast_func_ceiling:
+ {
+ double r = _left->eval_number(c, stack);
+
+ return r == r ? ceil(r) : r;
+ }
+
+ case ast_func_round:
+ return round_nearest_nzero(_left->eval_number(c, stack));
+
+ case ast_variable:
+ {
+ assert(_rettype == _data.variable->type());
+
+ if (_rettype == xpath_type_number)
+ return _data.variable->get_number();
+
+ // fallthrough to type conversion
+ }
+
+ default:
+ {
+ switch (_rettype)
+ {
+ case xpath_type_boolean:
+ return eval_boolean(c, stack) ? 1 : 0;
+
+ case xpath_type_string:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ return convert_string_to_number(eval_string(c, stack).c_str());
+ }
+
+ case xpath_type_node_set:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ return convert_string_to_number(eval_string(c, stack).c_str());
+ }
+
+ default:
+ assert(!"Wrong expression for return type number");
+ return 0;
+ }
+
+ }
+ }
+ }
+
+ xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack)
+ {
+ assert(_type == ast_func_concat);
+
+ xpath_allocator_capture ct(stack.temp);
+
+ // count the string number
+ size_t count = 1;
+ for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++;
+
+ // gather all strings
+ xpath_string static_buffer[4];
+ xpath_string* buffer = static_buffer;
+
+ // allocate on-heap for large concats
+ if (count > sizeof(static_buffer) / sizeof(static_buffer[0]))
+ {
+ buffer = static_cast<xpath_string*>(stack.temp->allocate(count * sizeof(xpath_string)));
+ assert(buffer);
+ }
+
+ // evaluate all strings to temporary stack
+ xpath_stack swapped_stack = {stack.temp, stack.result};
+
+ buffer[0] = _left->eval_string(c, swapped_stack);
+
+ size_t pos = 1;
+ for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack);
+ assert(pos == count);
+
+ // get total length
+ size_t length = 0;
+ for (size_t i = 0; i < count; ++i) length += buffer[i].length();
+
+ // create final string
+ char_t* result = static_cast<char_t*>(stack.result->allocate((length + 1) * sizeof(char_t)));
+ assert(result);
+
+ char_t* ri = result;
+
+ for (size_t j = 0; j < count; ++j)
+ for (const char_t* bi = buffer[j].c_str(); *bi; ++bi)
+ *ri++ = *bi;
+
+ *ri = 0;
+
+ return xpath_string(result, true);
+ }
+
+ xpath_string eval_string(const xpath_context& c, const xpath_stack& stack)
+ {
+ switch (_type)
+ {
+ case ast_string_constant:
+ return xpath_string_const(_data.string);
+
+ case ast_func_local_name_0:
+ {
+ xpath_node na = c.n;
+
+ return xpath_string_const(local_name(na));
+ }
+
+ case ast_func_local_name_1:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+ xpath_node na = ns.first();
+
+ return xpath_string_const(local_name(na));
+ }
+
+ case ast_func_name_0:
+ {
+ xpath_node na = c.n;
+
+ return xpath_string_const(qualified_name(na));
+ }
+
+ case ast_func_name_1:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+ xpath_node na = ns.first();
+
+ return xpath_string_const(qualified_name(na));
+ }
+
+ case ast_func_namespace_uri_0:
+ {
+ xpath_node na = c.n;
+
+ return xpath_string_const(namespace_uri(na));
+ }
+
+ case ast_func_namespace_uri_1:
+ {
+ xpath_allocator_capture cr(stack.result);
+
+ xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+ xpath_node na = ns.first();
+
+ return xpath_string_const(namespace_uri(na));
+ }
+
+ case ast_func_string_0:
+ return string_value(c.n, stack.result);
+
+ case ast_func_string_1:
+ return _left->eval_string(c, stack);
+
+ case ast_func_concat:
+ return eval_string_concat(c, stack);
+
+ case ast_func_substring_before:
+ {
+ xpath_allocator_capture cr(stack.temp);
+
+ xpath_stack swapped_stack = {stack.temp, stack.result};
+
+ xpath_string s = _left->eval_string(c, swapped_stack);
+ xpath_string p = _right->eval_string(c, swapped_stack);
+
+ const char_t* pos = find_substring(s.c_str(), p.c_str());
+
+ return pos ? xpath_string(s.c_str(), pos, stack.result) : xpath_string();
+ }
+
+ case ast_func_substring_after:
+ {
+ xpath_allocator_capture cr(stack.temp);
+
+ xpath_stack swapped_stack = {stack.temp, stack.result};
+
+ xpath_string s = _left->eval_string(c, swapped_stack);
+ xpath_string p = _right->eval_string(c, swapped_stack);
+
+ const char_t* pos = find_substring(s.c_str(), p.c_str());
+ if (!pos) return xpath_string();
+
+ const char_t* result = pos + p.length();
+
+ return s.uses_heap() ? xpath_string(result, stack.result) : xpath_string_const(result);
+ }
+
+ case ast_func_substring_2:
+ {
+ xpath_allocator_capture cr(stack.temp);
+
+ xpath_stack swapped_stack = {stack.temp, stack.result};
+
+ xpath_string s = _left->eval_string(c, swapped_stack);
+ size_t s_length = s.length();
+
+ double first = round_nearest(_right->eval_number(c, stack));
+
+ if (is_nan(first)) return xpath_string(); // NaN
+ else if (first >= s_length + 1) return xpath_string();
+
+ size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+ assert(1 <= pos && pos <= s_length + 1);
+
+ const char_t* rbegin = s.c_str() + (pos - 1);
+
+ return s.uses_heap() ? xpath_string(rbegin, stack.result) : xpath_string_const(rbegin);
+ }
+
+ case ast_func_substring_3:
+ {
+ xpath_allocator_capture cr(stack.temp);
+
+ xpath_stack swapped_stack = {stack.temp, stack.result};
+
+ xpath_string s = _left->eval_string(c, swapped_stack);
+ size_t s_length = s.length();
+
+ double first = round_nearest(_right->eval_number(c, stack));
+ double last = first + round_nearest(_right->_next->eval_number(c, stack));
+
+ if (is_nan(first) || is_nan(last)) return xpath_string();
+ else if (first >= s_length + 1) return xpath_string();
+ else if (first >= last) return xpath_string();
+ else if (last < 1) return xpath_string();
+
+ size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+ size_t end = last >= s_length + 1 ? s_length + 1 : static_cast<size_t>(last);
+
+ assert(1 <= pos && pos <= end && end <= s_length + 1);
+ const char_t* rbegin = s.c_str() + (pos - 1);
+ const char_t* rend = s.c_str() + (end - 1);
+
+ return (end == s_length + 1 && !s.uses_heap()) ? xpath_string_const(rbegin) : xpath_string(rbegin, rend, stack.result);
+ }
+
+ case ast_func_normalize_space_0:
+ {
+ xpath_string s = string_value(c.n, stack.result);
+
+ normalize_space(s.data(stack.result));
+
+ return s;
+ }
+
+ case ast_func_normalize_space_1:
+ {
+ xpath_string s = _left->eval_string(c, stack);
+
+ normalize_space(s.data(stack.result));
+
+ return s;
+ }
+
+ case ast_func_translate:
+ {
+ xpath_allocator_capture cr(stack.temp);
+
+ xpath_stack swapped_stack = {stack.temp, stack.result};
+
+ xpath_string s = _left->eval_string(c, stack);
+ xpath_string from = _right->eval_string(c, swapped_stack);
+ xpath_string to = _right->_next->eval_string(c, swapped_stack);
+
+ translate(s.data(stack.result), from.c_str(), to.c_str());
+
+ return s;
+ }
+
+ case ast_variable:
+ {
+ assert(_rettype == _data.variable->type());
+
+ if (_rettype == xpath_type_string)
+ return xpath_string_const(_data.variable->get_string());
+
+ // fallthrough to type conversion
+ }
+
+ default:
+ {
+ switch (_rettype)
+ {
+ case xpath_type_boolean:
+ return xpath_string_const(eval_boolean(c, stack) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+
+ case xpath_type_number:
+ return convert_number_to_string(eval_number(c, stack), stack.result);
+
+ case xpath_type_node_set:
+ {
+ xpath_allocator_capture cr(stack.temp);
+
+ xpath_stack swapped_stack = {stack.temp, stack.result};
+
+ xpath_node_set_raw ns = eval_node_set(c, swapped_stack);
+ return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result);
+ }
+
+ default:
+ assert(!"Wrong expression for return type string");
+ return xpath_string();
+ }
+ }
+ }
+ }
+
+ xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack)
+ {
+ switch (_type)
+ {
+ case ast_op_union:
+ {
+ xpath_allocator_capture cr(stack.temp);
+
+ xpath_stack swapped_stack = {stack.temp, stack.result};
+
+ xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack);
+ xpath_node_set_raw rs = _right->eval_node_set(c, stack);
+
+ // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother
+ rs.set_type(xpath_node_set::type_unsorted);
+
+ rs.append(ls.begin(), ls.end(), stack.result);
+ rs.remove_duplicates();
+
+ return rs;
+ }
+
+ case ast_filter:
+ case ast_filter_posinv:
+ {
+ xpath_node_set_raw set = _left->eval_node_set(c, stack);
+
+ // either expression is a number or it contains position() call; sort by document order
+ if (_type == ast_filter) set.sort_do();
+
+ apply_predicate(set, 0, _right, stack);
+
+ return set;
+ }
+
+ case ast_func_id:
+ return xpath_node_set_raw();
+
+ case ast_step:
+ {
+ switch (_axis)
+ {
+ case axis_ancestor:
+ return step_do(c, stack, axis_to_type<axis_ancestor>());
+
+ case axis_ancestor_or_self:
+ return step_do(c, stack, axis_to_type<axis_ancestor_or_self>());
+
+ case axis_attribute:
+ return step_do(c, stack, axis_to_type<axis_attribute>());
+
+ case axis_child:
+ return step_do(c, stack, axis_to_type<axis_child>());
+
+ case axis_descendant:
+ return step_do(c, stack, axis_to_type<axis_descendant>());
+
+ case axis_descendant_or_self:
+ return step_do(c, stack, axis_to_type<axis_descendant_or_self>());
+
+ case axis_following:
+ return step_do(c, stack, axis_to_type<axis_following>());
+
+ case axis_following_sibling:
+ return step_do(c, stack, axis_to_type<axis_following_sibling>());
+
+ case axis_namespace:
+ // namespaced axis is not supported
+ return xpath_node_set_raw();
+
+ case axis_parent:
+ return step_do(c, stack, axis_to_type<axis_parent>());
+
+ case axis_preceding:
+ return step_do(c, stack, axis_to_type<axis_preceding>());
+
+ case axis_preceding_sibling:
+ return step_do(c, stack, axis_to_type<axis_preceding_sibling>());
+
+ case axis_self:
+ return step_do(c, stack, axis_to_type<axis_self>());
+
+ default:
+ assert(!"Unknown axis");
+ return xpath_node_set_raw();
+ }
+ }
+
+ case ast_step_root:
+ {
+ assert(!_right); // root step can't have any predicates
+
+ xpath_node_set_raw ns;
+
+ ns.set_type(xpath_node_set::type_sorted);
+
+ if (c.n.node()) ns.push_back(c.n.node().root(), stack.result);
+ else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result);
+
+ return ns;
+ }
+
+ case ast_variable:
+ {
+ assert(_rettype == _data.variable->type());
+
+ if (_rettype == xpath_type_node_set)
+ {
+ const xpath_node_set& s = _data.variable->get_node_set();
+
+ xpath_node_set_raw ns;
+
+ ns.set_type(s.type());
+ ns.append(s.begin(), s.end(), stack.result);
+
+ return ns;
+ }
+
+ // fallthrough to type conversion
+ }
+
+ default:
+ assert(!"Wrong expression for return type node set");
+ return xpath_node_set_raw();
+ }
+ }
+
+ bool is_posinv()
+ {
+ switch (_type)
+ {
+ case ast_func_position:
+ return false;
+
+ case ast_string_constant:
+ case ast_number_constant:
+ case ast_variable:
+ return true;
+
+ case ast_step:
+ case ast_step_root:
+ return true;
+
+ case ast_predicate:
+ case ast_filter:
+ case ast_filter_posinv:
+ return true;
+
+ default:
+ if (_left && !_left->is_posinv()) return false;
+
+ for (xpath_ast_node* n = _right; n; n = n->_next)
+ if (!n->is_posinv()) return false;
+
+ return true;
+ }
+ }
+
+ xpath_value_type rettype() const
+ {
+ return static_cast<xpath_value_type>(_rettype);
+ }
+ };
+
+ struct xpath_parser
+ {
+ xpath_allocator* _alloc;
+ xpath_lexer _lexer;
+
+ const char_t* _query;
+ xpath_variable_set* _variables;
+
+ xpath_parse_result* _result;
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ jmp_buf _error_handler;
+ #endif
+
+ void throw_error(const char* message)
+ {
+ _result->error = message;
+ _result->offset = _lexer.current_pos() - _query;
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ longjmp(_error_handler, 1);
+ #else
+ throw xpath_exception(*_result);
+ #endif
+ }
+
+ void throw_error_oom()
+ {
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ throw_error("Out of memory");
+ #else
+ throw std::bad_alloc();
+ #endif
+ }
+
+ void* alloc_node()
+ {
+ void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node));
+
+ if (!result) throw_error_oom();
+
+ return result;
+ }
+
+ const char_t* alloc_string(const xpath_lexer_string& value)
+ {
+ if (value.begin)
+ {
+ size_t length = static_cast<size_t>(value.end - value.begin);
+
+ char_t* c = static_cast<char_t*>(_alloc->allocate_nothrow((length + 1) * sizeof(char_t)));
+ if (!c) throw_error_oom();
+
+ memcpy(c, value.begin, length * sizeof(char_t));
+ c[length] = 0;
+
+ return c;
+ }
+ else return 0;
+ }
+
+ xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2])
+ {
+ assert(argc <= 1);
+
+ if (argc == 1 && args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+
+ return new (alloc_node()) xpath_ast_node(argc == 0 ? type0 : type1, xpath_type_string, args[0]);
+ }
+
+ xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2])
+ {
+ switch (name.begin[0])
+ {
+ case 'b':
+ if (name == PUGIXML_TEXT("boolean") && argc == 1)
+ return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]);
+
+ break;
+
+ case 'c':
+ if (name == PUGIXML_TEXT("count") && argc == 1)
+ {
+ if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+ return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]);
+ }
+ else if (name == PUGIXML_TEXT("contains") && argc == 2)
+ return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_string, args[0], args[1]);
+ else if (name == PUGIXML_TEXT("concat") && argc >= 2)
+ return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]);
+ else if (name == PUGIXML_TEXT("ceiling") && argc == 1)
+ return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]);
+
+ break;
+
+ case 'f':
+ if (name == PUGIXML_TEXT("false") && argc == 0)
+ return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean);
+ else if (name == PUGIXML_TEXT("floor") && argc == 1)
+ return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]);
+
+ break;
+
+ case 'i':
+ if (name == PUGIXML_TEXT("id") && argc == 1)
+ return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]);
+
+ break;
+
+ case 'l':
+ if (name == PUGIXML_TEXT("last") && argc == 0)
+ return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number);
+ else if (name == PUGIXML_TEXT("lang") && argc == 1)
+ return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]);
+ else if (name == PUGIXML_TEXT("local-name") && argc <= 1)
+ return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args);
+
+ break;
+
+ case 'n':
+ if (name == PUGIXML_TEXT("name") && argc <= 1)
+ return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args);
+ else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1)
+ return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args);
+ else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1)
+ return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]);
+ else if (name == PUGIXML_TEXT("not") && argc == 1)
+ return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]);
+ else if (name == PUGIXML_TEXT("number") && argc <= 1)
+ return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]);
+
+ break;
+
+ case 'p':
+ if (name == PUGIXML_TEXT("position") && argc == 0)
+ return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number);
+
+ break;
+
+ case 'r':
+ if (name == PUGIXML_TEXT("round") && argc == 1)
+ return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]);
+
+ break;
+
+ case 's':
+ if (name == PUGIXML_TEXT("string") && argc <= 1)
+ return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]);
+ else if (name == PUGIXML_TEXT("string-length") && argc <= 1)
+ return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_string, args[0]);
+ else if (name == PUGIXML_TEXT("starts-with") && argc == 2)
+ return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]);
+ else if (name == PUGIXML_TEXT("substring-before") && argc == 2)
+ return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]);
+ else if (name == PUGIXML_TEXT("substring-after") && argc == 2)
+ return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]);
+ else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3))
+ return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]);
+ else if (name == PUGIXML_TEXT("sum") && argc == 1)
+ {
+ if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+ return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]);
+ }
+
+ break;
+
+ case 't':
+ if (name == PUGIXML_TEXT("translate") && argc == 3)
+ return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]);
+ else if (name == PUGIXML_TEXT("true") && argc == 0)
+ return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean);
+
+ break;
+
+ default:
+ break;
+ }
+
+ throw_error("Unrecognized function or wrong parameter count");
+
+ return 0;
+ }
+
+ axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified)
+ {
+ specified = true;
+
+ switch (name.begin[0])
+ {
+ case 'a':
+ if (name == PUGIXML_TEXT("ancestor"))
+ return axis_ancestor;
+ else if (name == PUGIXML_TEXT("ancestor-or-self"))
+ return axis_ancestor_or_self;
+ else if (name == PUGIXML_TEXT("attribute"))
+ return axis_attribute;
+
+ break;
+
+ case 'c':
+ if (name == PUGIXML_TEXT("child"))
+ return axis_child;
+
+ break;
+
+ case 'd':
+ if (name == PUGIXML_TEXT("descendant"))
+ return axis_descendant;
+ else if (name == PUGIXML_TEXT("descendant-or-self"))
+ return axis_descendant_or_self;
+
+ break;
+
+ case 'f':
+ if (name == PUGIXML_TEXT("following"))
+ return axis_following;
+ else if (name == PUGIXML_TEXT("following-sibling"))
+ return axis_following_sibling;
+
+ break;
+
+ case 'n':
+ if (name == PUGIXML_TEXT("namespace"))
+ return axis_namespace;
+
+ break;
+
+ case 'p':
+ if (name == PUGIXML_TEXT("parent"))
+ return axis_parent;
+ else if (name == PUGIXML_TEXT("preceding"))
+ return axis_preceding;
+ else if (name == PUGIXML_TEXT("preceding-sibling"))
+ return axis_preceding_sibling;
+
+ break;
+
+ case 's':
+ if (name == PUGIXML_TEXT("self"))
+ return axis_self;
+
+ break;
+
+ default:
+ break;
+ }
+
+ specified = false;
+ return axis_child;
+ }
+
+ nodetest_t parse_node_test_type(const xpath_lexer_string& name)
+ {
+ switch (name.begin[0])
+ {
+ case 'c':
+ if (name == PUGIXML_TEXT("comment"))
+ return nodetest_type_comment;
+
+ break;
+
+ case 'n':
+ if (name == PUGIXML_TEXT("node"))
+ return nodetest_type_node;
+
+ break;
+
+ case 'p':
+ if (name == PUGIXML_TEXT("processing-instruction"))
+ return nodetest_type_pi;
+
+ break;
+
+ case 't':
+ if (name == PUGIXML_TEXT("text"))
+ return nodetest_type_text;
+
+ break;
+
+ default:
+ break;
+ }
+
+ return nodetest_none;
+ }
+
+ // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
+ xpath_ast_node* parse_primary_expression()
+ {
+ switch (_lexer.current())
+ {
+ case lex_var_ref:
+ {
+ xpath_lexer_string name = _lexer.contents();
+
+ if (!_variables)
+ throw_error("Unknown variable: variable set is not provided");
+
+ xpath_variable* var = get_variable(_variables, name.begin, name.end);
+
+ if (!var)
+ throw_error("Unknown variable: variable set does not contain the given name");
+
+ _lexer.next();
+
+ return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var);
+ }
+
+ case lex_open_brace:
+ {
+ _lexer.next();
+
+ xpath_ast_node* n = parse_expression();
+
+ if (_lexer.current() != lex_close_brace)
+ throw_error("Unmatched braces");
+
+ _lexer.next();
+
+ return n;
+ }
+
+ case lex_quoted_string:
+ {
+ const char_t* value = alloc_string(_lexer.contents());
+
+ xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value);
+ _lexer.next();
+
+ return n;
+ }
+
+ case lex_number:
+ {
+ double value = 0;
+
+ if (!convert_string_to_number(_lexer.contents().begin, _lexer.contents().end, &value))
+ throw_error_oom();
+
+ xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value);
+ _lexer.next();
+
+ return n;
+ }
+
+ case lex_string:
+ {
+ xpath_ast_node* args[2] = {0};
+ size_t argc = 0;
+
+ xpath_lexer_string function = _lexer.contents();
+ _lexer.next();
+
+ xpath_ast_node* last_arg = 0;
+
+ if (_lexer.current() != lex_open_brace)
+ throw_error("Unrecognized function call");
+ _lexer.next();
+
+ if (_lexer.current() != lex_close_brace)
+ args[argc++] = parse_expression();
+
+ while (_lexer.current() != lex_close_brace)
+ {
+ if (_lexer.current() != lex_comma)
+ throw_error("No comma between function arguments");
+ _lexer.next();
+
+ xpath_ast_node* n = parse_expression();
+
+ if (argc < 2) args[argc] = n;
+ else last_arg->set_next(n);
+
+ argc++;
+ last_arg = n;
+ }
+
+ _lexer.next();
+
+ return parse_function(function, argc, args);
+ }
+
+ default:
+ throw_error("Unrecognizable primary expression");
+
+ return 0;
+ }
+ }
+
+ // FilterExpr ::= PrimaryExpr | FilterExpr Predicate
+ // Predicate ::= '[' PredicateExpr ']'
+ // PredicateExpr ::= Expr
+ xpath_ast_node* parse_filter_expression()
+ {
+ xpath_ast_node* n = parse_primary_expression();
+
+ while (_lexer.current() == lex_open_square_brace)
+ {
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_expression();
+
+ if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set");
+
+ bool posinv = expr->rettype() != xpath_type_number && expr->is_posinv();
+
+ n = new (alloc_node()) xpath_ast_node(posinv ? ast_filter_posinv : ast_filter, xpath_type_node_set, n, expr);
+
+ if (_lexer.current() != lex_close_square_brace)
+ throw_error("Unmatched square brace");
+
+ _lexer.next();
+ }
+
+ return n;
+ }
+
+ // Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
+ // AxisSpecifier ::= AxisName '::' | '@'?
+ // NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')'
+ // NameTest ::= '*' | NCName ':' '*' | QName
+ // AbbreviatedStep ::= '.' | '..'
+ xpath_ast_node* parse_step(xpath_ast_node* set)
+ {
+ if (set && set->rettype() != xpath_type_node_set)
+ throw_error("Step has to be applied to node set");
+
+ bool axis_specified = false;
+ axis_t axis = axis_child; // implied child axis
+
+ if (_lexer.current() == lex_axis_attribute)
+ {
+ axis = axis_attribute;
+ axis_specified = true;
+
+ _lexer.next();
+ }
+ else if (_lexer.current() == lex_dot)
+ {
+ _lexer.next();
+
+ return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0);
+ }
+ else if (_lexer.current() == lex_double_dot)
+ {
+ _lexer.next();
+
+ return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0);
+ }
+
+ nodetest_t nt_type = nodetest_none;
+ xpath_lexer_string nt_name;
+
+ if (_lexer.current() == lex_string)
+ {
+ // node name test
+ nt_name = _lexer.contents();
+ _lexer.next();
+
+ // was it an axis name?
+ if (_lexer.current() == lex_double_colon)
+ {
+ // parse axis name
+ if (axis_specified) throw_error("Two axis specifiers in one step");
+
+ axis = parse_axis_name(nt_name, axis_specified);
+
+ if (!axis_specified) throw_error("Unknown axis");
+
+ // read actual node test
+ _lexer.next();
+
+ if (_lexer.current() == lex_multiply)
+ {
+ nt_type = nodetest_all;
+ nt_name = xpath_lexer_string();
+ _lexer.next();
+ }
+ else if (_lexer.current() == lex_string)
+ {
+ nt_name = _lexer.contents();
+ _lexer.next();
+ }
+ else throw_error("Unrecognized node test");
+ }
+
+ if (nt_type == nodetest_none)
+ {
+ // node type test or processing-instruction
+ if (_lexer.current() == lex_open_brace)
+ {
+ _lexer.next();
+
+ if (_lexer.current() == lex_close_brace)
+ {
+ _lexer.next();
+
+ nt_type = parse_node_test_type(nt_name);
+
+ if (nt_type == nodetest_none) throw_error("Unrecognized node type");
+
+ nt_name = xpath_lexer_string();
+ }
+ else if (nt_name == PUGIXML_TEXT("processing-instruction"))
+ {
+ if (_lexer.current() != lex_quoted_string)
+ throw_error("Only literals are allowed as arguments to processing-instruction()");
+
+ nt_type = nodetest_pi;
+ nt_name = _lexer.contents();
+ _lexer.next();
+
+ if (_lexer.current() != lex_close_brace)
+ throw_error("Unmatched brace near processing-instruction()");
+ _lexer.next();
+ }
+ else
+ throw_error("Unmatched brace near node type test");
+
+ }
+ // QName or NCName:*
+ else
+ {
+ if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') // NCName:*
+ {
+ nt_name.end--; // erase *
+
+ nt_type = nodetest_all_in_namespace;
+ }
+ else nt_type = nodetest_name;
+ }
+ }
+ }
+ else if (_lexer.current() == lex_multiply)
+ {
+ nt_type = nodetest_all;
+ _lexer.next();
+ }
+ else throw_error("Unrecognized node test");
+
+ xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name));
+
+ xpath_ast_node* last = 0;
+
+ while (_lexer.current() == lex_open_square_brace)
+ {
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_expression();
+
+ xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, xpath_type_node_set, expr);
+
+ if (_lexer.current() != lex_close_square_brace)
+ throw_error("Unmatched square brace");
+ _lexer.next();
+
+ if (last) last->set_next(pred);
+ else n->set_right(pred);
+
+ last = pred;
+ }
+
+ return n;
+ }
+
+ // RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step
+ xpath_ast_node* parse_relative_location_path(xpath_ast_node* set)
+ {
+ xpath_ast_node* n = parse_step(set);
+
+ while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
+ {
+ lexeme_t l = _lexer.current();
+ _lexer.next();
+
+ if (l == lex_double_slash)
+ n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+
+ n = parse_step(n);
+ }
+
+ return n;
+ }
+
+ // LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
+ // AbsoluteLocationPath ::= '/' RelativeLocationPath? | '//' RelativeLocationPath
+ xpath_ast_node* parse_location_path()
+ {
+ if (_lexer.current() == lex_slash)
+ {
+ _lexer.next();
+
+ xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+
+ // relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path
+ lexeme_t l = _lexer.current();
+
+ if (l == lex_string || l == lex_axis_attribute || l == lex_dot || l == lex_double_dot || l == lex_multiply)
+ return parse_relative_location_path(n);
+ else
+ return n;
+ }
+ else if (_lexer.current() == lex_double_slash)
+ {
+ _lexer.next();
+
+ xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+ n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+
+ return parse_relative_location_path(n);
+ }
+
+ // else clause moved outside of if because of bogus warning 'control may reach end of non-void function being inlined' in gcc 4.0.1
+ return parse_relative_location_path(0);
+ }
+
+ // PathExpr ::= LocationPath
+ // | FilterExpr
+ // | FilterExpr '/' RelativeLocationPath
+ // | FilterExpr '//' RelativeLocationPath
+ xpath_ast_node* parse_path_expression()
+ {
+ // Clarification.
+ // PathExpr begins with either LocationPath or FilterExpr.
+ // FilterExpr begins with PrimaryExpr
+ // PrimaryExpr begins with '$' in case of it being a variable reference,
+ // '(' in case of it being an expression, string literal, number constant or
+ // function call.
+
+ if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace ||
+ _lexer.current() == lex_quoted_string || _lexer.current() == lex_number ||
+ _lexer.current() == lex_string)
+ {
+ if (_lexer.current() == lex_string)
+ {
+ // This is either a function call, or not - if not, we shall proceed with location path
+ const char_t* state = _lexer.state();
+
+ while (PUGI__IS_CHARTYPE(*state, ct_space)) ++state;
+
+ if (*state != '(') return parse_location_path();
+
+ // This looks like a function call; however this still can be a node-test. Check it.
+ if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path();
+ }
+
+ xpath_ast_node* n = parse_filter_expression();
+
+ if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
+ {
+ lexeme_t l = _lexer.current();
+ _lexer.next();
+
+ if (l == lex_double_slash)
+ {
+ if (n->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set");
+
+ n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+ }
+
+ // select from location path
+ return parse_relative_location_path(n);
+ }
+
+ return n;
+ }
+ else return parse_location_path();
+ }
+
+ // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
+ xpath_ast_node* parse_union_expression()
+ {
+ xpath_ast_node* n = parse_path_expression();
+
+ while (_lexer.current() == lex_union)
+ {
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_union_expression();
+
+ if (n->rettype() != xpath_type_node_set || expr->rettype() != xpath_type_node_set)
+ throw_error("Union operator has to be applied to node sets");
+
+ n = new (alloc_node()) xpath_ast_node(ast_op_union, xpath_type_node_set, n, expr);
+ }
+
+ return n;
+ }
+
+ // UnaryExpr ::= UnionExpr | '-' UnaryExpr
+ xpath_ast_node* parse_unary_expression()
+ {
+ if (_lexer.current() == lex_minus)
+ {
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_unary_expression();
+
+ return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr);
+ }
+ else return parse_union_expression();
+ }
+
+ // MultiplicativeExpr ::= UnaryExpr
+ // | MultiplicativeExpr '*' UnaryExpr
+ // | MultiplicativeExpr 'div' UnaryExpr
+ // | MultiplicativeExpr 'mod' UnaryExpr
+ xpath_ast_node* parse_multiplicative_expression()
+ {
+ xpath_ast_node* n = parse_unary_expression();
+
+ while (_lexer.current() == lex_multiply || (_lexer.current() == lex_string &&
+ (_lexer.contents() == PUGIXML_TEXT("mod") || _lexer.contents() == PUGIXML_TEXT("div"))))
+ {
+ ast_type_t op = _lexer.current() == lex_multiply ? ast_op_multiply :
+ _lexer.contents().begin[0] == 'd' ? ast_op_divide : ast_op_mod;
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_unary_expression();
+
+ n = new (alloc_node()) xpath_ast_node(op, xpath_type_number, n, expr);
+ }
+
+ return n;
+ }
+
+ // AdditiveExpr ::= MultiplicativeExpr
+ // | AdditiveExpr '+' MultiplicativeExpr
+ // | AdditiveExpr '-' MultiplicativeExpr
+ xpath_ast_node* parse_additive_expression()
+ {
+ xpath_ast_node* n = parse_multiplicative_expression();
+
+ while (_lexer.current() == lex_plus || _lexer.current() == lex_minus)
+ {
+ lexeme_t l = _lexer.current();
+
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_multiplicative_expression();
+
+ n = new (alloc_node()) xpath_ast_node(l == lex_plus ? ast_op_add : ast_op_subtract, xpath_type_number, n, expr);
+ }
+
+ return n;
+ }
+
+ // RelationalExpr ::= AdditiveExpr
+ // | RelationalExpr '<' AdditiveExpr
+ // | RelationalExpr '>' AdditiveExpr
+ // | RelationalExpr '<=' AdditiveExpr
+ // | RelationalExpr '>=' AdditiveExpr
+ xpath_ast_node* parse_relational_expression()
+ {
+ xpath_ast_node* n = parse_additive_expression();
+
+ while (_lexer.current() == lex_less || _lexer.current() == lex_less_or_equal ||
+ _lexer.current() == lex_greater || _lexer.current() == lex_greater_or_equal)
+ {
+ lexeme_t l = _lexer.current();
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_additive_expression();
+
+ n = new (alloc_node()) xpath_ast_node(l == lex_less ? ast_op_less : l == lex_greater ? ast_op_greater :
+ l == lex_less_or_equal ? ast_op_less_or_equal : ast_op_greater_or_equal, xpath_type_boolean, n, expr);
+ }
+
+ return n;
+ }
+
+ // EqualityExpr ::= RelationalExpr
+ // | EqualityExpr '=' RelationalExpr
+ // | EqualityExpr '!=' RelationalExpr
+ xpath_ast_node* parse_equality_expression()
+ {
+ xpath_ast_node* n = parse_relational_expression();
+
+ while (_lexer.current() == lex_equal || _lexer.current() == lex_not_equal)
+ {
+ lexeme_t l = _lexer.current();
+
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_relational_expression();
+
+ n = new (alloc_node()) xpath_ast_node(l == lex_equal ? ast_op_equal : ast_op_not_equal, xpath_type_boolean, n, expr);
+ }
+
+ return n;
+ }
+
+ // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
+ xpath_ast_node* parse_and_expression()
+ {
+ xpath_ast_node* n = parse_equality_expression();
+
+ while (_lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("and"))
+ {
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_equality_expression();
+
+ n = new (alloc_node()) xpath_ast_node(ast_op_and, xpath_type_boolean, n, expr);
+ }
+
+ return n;
+ }
+
+ // OrExpr ::= AndExpr | OrExpr 'or' AndExpr
+ xpath_ast_node* parse_or_expression()
+ {
+ xpath_ast_node* n = parse_and_expression();
+
+ while (_lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("or"))
+ {
+ _lexer.next();
+
+ xpath_ast_node* expr = parse_and_expression();
+
+ n = new (alloc_node()) xpath_ast_node(ast_op_or, xpath_type_boolean, n, expr);
+ }
+
+ return n;
+ }
+
+ // Expr ::= OrExpr
+ xpath_ast_node* parse_expression()
+ {
+ return parse_or_expression();
+ }
+
+ xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result)
+ {
+ }
+
+ xpath_ast_node* parse()
+ {
+ xpath_ast_node* result = parse_expression();
+
+ if (_lexer.current() != lex_eof)
+ {
+ // there are still unparsed tokens left, error
+ throw_error("Incorrect query");
+ }
+
+ return result;
+ }
+
+ static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result)
+ {
+ xpath_parser parser(query, variables, alloc, result);
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ int error = setjmp(parser._error_handler);
+
+ return (error == 0) ? parser.parse() : 0;
+ #else
+ return parser.parse();
+ #endif
+ }
+ };
+
+ struct xpath_query_impl
+ {
+ static xpath_query_impl* create()
+ {
+ void* memory = xml_memory::allocate(sizeof(xpath_query_impl));
+
+ return new (memory) xpath_query_impl();
+ }
+
+ static void destroy(void* ptr)
+ {
+ if (!ptr) return;
+
+ // free all allocated pages
+ static_cast<xpath_query_impl*>(ptr)->alloc.release();
+
+ // free allocator memory (with the first page)
+ xml_memory::deallocate(ptr);
+ }
+
+ xpath_query_impl(): root(0), alloc(&block)
+ {
+ block.next = 0;
+ }
+
+ xpath_ast_node* root;
+ xpath_allocator alloc;
+ xpath_memory_block block;
+ };
+
+ PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd)
+ {
+ if (!impl) return xpath_string();
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ if (setjmp(sd.error_handler)) return xpath_string();
+ #endif
+
+ xpath_context c(n, 1, 1);
+
+ return impl->root->eval_string(c, sd.stack);
+ }
+PUGI__NS_END
+
+namespace pugi
+{
+#ifndef PUGIXML_NO_EXCEPTIONS
+ PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_): _result(result_)
+ {
+ assert(_result.error);
+ }
+
+ PUGI__FN const char* xpath_exception::what() const throw()
+ {
+ return _result.error;
+ }
+
+ PUGI__FN const xpath_parse_result& xpath_exception::result() const
+ {
+ return _result;
+ }
+#endif
+
+ PUGI__FN xpath_node::xpath_node()
+ {
+ }
+
+ PUGI__FN xpath_node::xpath_node(const xml_node& node_): _node(node_)
+ {
+ }
+
+ PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_): _node(attribute_ ? parent_ : xml_node()), _attribute(attribute_)
+ {
+ }
+
+ PUGI__FN xml_node xpath_node::node() const
+ {
+ return _attribute ? xml_node() : _node;
+ }
+
+ PUGI__FN xml_attribute xpath_node::attribute() const
+ {
+ return _attribute;
+ }
+
+ PUGI__FN xml_node xpath_node::parent() const
+ {
+ return _attribute ? _node : _node.parent();
+ }
+
+ PUGI__FN static void unspecified_bool_xpath_node(xpath_node***)
+ {
+ }
+
+ PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const
+ {
+ return (_node || _attribute) ? unspecified_bool_xpath_node : 0;
+ }
+
+ PUGI__FN bool xpath_node::operator!() const
+ {
+ return !(_node || _attribute);
+ }
+
+ PUGI__FN bool xpath_node::operator==(const xpath_node& n) const
+ {
+ return _node == n._node && _attribute == n._attribute;
+ }
+
+ PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const
+ {
+ return _node != n._node || _attribute != n._attribute;
+ }
+
+#ifdef __BORLANDC__
+ PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs)
+ {
+ return (bool)lhs && rhs;
+ }
+
+ PUGI__FN bool operator||(const xpath_node& lhs, bool rhs)
+ {
+ return (bool)lhs || rhs;
+ }
+#endif
+
+ PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_)
+ {
+ assert(begin_ <= end_);
+
+ size_t size_ = static_cast<size_t>(end_ - begin_);
+
+ if (size_ <= 1)
+ {
+ // deallocate old buffer
+ if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+ // use internal buffer
+ if (begin_ != end_) _storage = *begin_;
+
+ _begin = &_storage;
+ _end = &_storage + size_;
+ }
+ else
+ {
+ // make heap copy
+ xpath_node* storage = static_cast<xpath_node*>(impl::xml_memory::allocate(size_ * sizeof(xpath_node)));
+
+ if (!storage)
+ {
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ return;
+ #else
+ throw std::bad_alloc();
+ #endif
+ }
+
+ memcpy(storage, begin_, size_ * sizeof(xpath_node));
+
+ // deallocate old buffer
+ if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+ // finalize
+ _begin = storage;
+ _end = storage + size_;
+ }
+ }
+
+ PUGI__FN xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage)
+ {
+ }
+
+ PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_): _type(type_), _begin(&_storage), _end(&_storage)
+ {
+ _assign(begin_, end_);
+ }
+
+ PUGI__FN xpath_node_set::~xpath_node_set()
+ {
+ if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+ }
+
+ PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(ns._type), _begin(&_storage), _end(&_storage)
+ {
+ _assign(ns._begin, ns._end);
+ }
+
+ PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns)
+ {
+ if (this == &ns) return *this;
+
+ _type = ns._type;
+ _assign(ns._begin, ns._end);
+
+ return *this;
+ }
+
+ PUGI__FN xpath_node_set::type_t xpath_node_set::type() const
+ {
+ return _type;
+ }
+
+ PUGI__FN size_t xpath_node_set::size() const
+ {
+ return _end - _begin;
+ }
+
+ PUGI__FN bool xpath_node_set::empty() const
+ {
+ return _begin == _end;
+ }
+
+ PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const
+ {
+ assert(index < size());
+ return _begin[index];
+ }
+
+ PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const
+ {
+ return _begin;
+ }
+
+ PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const
+ {
+ return _end;
+ }
+
+ PUGI__FN void xpath_node_set::sort(bool reverse)
+ {
+ _type = impl::xpath_sort(_begin, _end, _type, reverse);
+ }
+
+ PUGI__FN xpath_node xpath_node_set::first() const
+ {
+ return impl::xpath_first(_begin, _end, _type);
+ }
+
+ PUGI__FN xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0)
+ {
+ }
+
+ PUGI__FN xpath_parse_result::operator bool() const
+ {
+ return error == 0;
+ }
+
+ PUGI__FN const char* xpath_parse_result::description() const
+ {
+ return error ? error : "No error";
+ }
+
+ PUGI__FN xpath_variable::xpath_variable()
+ {
+ }
+
+ PUGI__FN const char_t* xpath_variable::name() const
+ {
+ switch (_type)
+ {
+ case xpath_type_node_set:
+ return static_cast<const impl::xpath_variable_node_set*>(this)->name;
+
+ case xpath_type_number:
+ return static_cast<const impl::xpath_variable_number*>(this)->name;
+
+ case xpath_type_string:
+ return static_cast<const impl::xpath_variable_string*>(this)->name;
+
+ case xpath_type_boolean:
+ return static_cast<const impl::xpath_variable_boolean*>(this)->name;
+
+ default:
+ assert(!"Invalid variable type");
+ return 0;
+ }
+ }
+
+ PUGI__FN xpath_value_type xpath_variable::type() const
+ {
+ return _type;
+ }
+
+ PUGI__FN bool xpath_variable::get_boolean() const
+ {
+ return (_type == xpath_type_boolean) ? static_cast<const impl::xpath_variable_boolean*>(this)->value : false;
+ }
+
+ PUGI__FN double xpath_variable::get_number() const
+ {
+ return (_type == xpath_type_number) ? static_cast<const impl::xpath_variable_number*>(this)->value : impl::gen_nan();
+ }
+
+ PUGI__FN const char_t* xpath_variable::get_string() const
+ {
+ const char_t* value = (_type == xpath_type_string) ? static_cast<const impl::xpath_variable_string*>(this)->value : 0;
+ return value ? value : PUGIXML_TEXT("");
+ }
+
+ PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const
+ {
+ return (_type == xpath_type_node_set) ? static_cast<const impl::xpath_variable_node_set*>(this)->value : impl::dummy_node_set;
+ }
+
+ PUGI__FN bool xpath_variable::set(bool value)
+ {
+ if (_type != xpath_type_boolean) return false;
+
+ static_cast<impl::xpath_variable_boolean*>(this)->value = value;
+ return true;
+ }
+
+ PUGI__FN bool xpath_variable::set(double value)
+ {
+ if (_type != xpath_type_number) return false;
+
+ static_cast<impl::xpath_variable_number*>(this)->value = value;
+ return true;
+ }
+
+ PUGI__FN bool xpath_variable::set(const char_t* value)
+ {
+ if (_type != xpath_type_string) return false;
+
+ impl::xpath_variable_string* var = static_cast<impl::xpath_variable_string*>(this);
+
+ // duplicate string
+ size_t size = (impl::strlength(value) + 1) * sizeof(char_t);
+
+ char_t* copy = static_cast<char_t*>(impl::xml_memory::allocate(size));
+ if (!copy) return false;
+
+ memcpy(copy, value, size);
+
+ // replace old string
+ if (var->value) impl::xml_memory::deallocate(var->value);
+ var->value = copy;
+
+ return true;
+ }
+
+ PUGI__FN bool xpath_variable::set(const xpath_node_set& value)
+ {
+ if (_type != xpath_type_node_set) return false;
+
+ static_cast<impl::xpath_variable_node_set*>(this)->value = value;
+ return true;
+ }
+
+ PUGI__FN xpath_variable_set::xpath_variable_set()
+ {
+ for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) _data[i] = 0;
+ }
+
+ PUGI__FN xpath_variable_set::~xpath_variable_set()
+ {
+ for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i)
+ {
+ xpath_variable* var = _data[i];
+
+ while (var)
+ {
+ xpath_variable* next = var->_next;
+
+ impl::delete_xpath_variable(var->_type, var);
+
+ var = next;
+ }
+ }
+ }
+
+ PUGI__FN xpath_variable* xpath_variable_set::find(const char_t* name) const
+ {
+ const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
+ size_t hash = impl::hash_string(name) % hash_size;
+
+ // look for existing variable
+ for (xpath_variable* var = _data[hash]; var; var = var->_next)
+ if (impl::strequal(var->name(), name))
+ return var;
+
+ return 0;
+ }
+
+ PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type)
+ {
+ const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
+ size_t hash = impl::hash_string(name) % hash_size;
+
+ // look for existing variable
+ for (xpath_variable* var = _data[hash]; var; var = var->_next)
+ if (impl::strequal(var->name(), name))
+ return var->type() == type ? var : 0;
+
+ // add new variable
+ xpath_variable* result = impl::new_xpath_variable(type, name);
+
+ if (result)
+ {
+ result->_type = type;
+ result->_next = _data[hash];
+
+ _data[hash] = result;
+ }
+
+ return result;
+ }
+
+ PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value)
+ {
+ xpath_variable* var = add(name, xpath_type_boolean);
+ return var ? var->set(value) : false;
+ }
+
+ PUGI__FN bool xpath_variable_set::set(const char_t* name, double value)
+ {
+ xpath_variable* var = add(name, xpath_type_number);
+ return var ? var->set(value) : false;
+ }
+
+ PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value)
+ {
+ xpath_variable* var = add(name, xpath_type_string);
+ return var ? var->set(value) : false;
+ }
+
+ PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value)
+ {
+ xpath_variable* var = add(name, xpath_type_node_set);
+ return var ? var->set(value) : false;
+ }
+
+ PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name)
+ {
+ return find(name);
+ }
+
+ PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const
+ {
+ return find(name);
+ }
+
+ PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0)
+ {
+ impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create();
+
+ if (!qimpl)
+ {
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ _result.error = "Out of memory";
+ #else
+ throw std::bad_alloc();
+ #endif
+ }
+ else
+ {
+ impl::buffer_holder impl_holder(qimpl, impl::xpath_query_impl::destroy);
+
+ qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result);
+
+ if (qimpl->root)
+ {
+ _impl = static_cast<impl::xpath_query_impl*>(impl_holder.release());
+ _result.error = 0;
+ }
+ }
+ }
+
+ PUGI__FN xpath_query::~xpath_query()
+ {
+ impl::xpath_query_impl::destroy(_impl);
+ }
+
+ PUGI__FN xpath_value_type xpath_query::return_type() const
+ {
+ if (!_impl) return xpath_type_none;
+
+ return static_cast<impl::xpath_query_impl*>(_impl)->root->rettype();
+ }
+
+ PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const
+ {
+ if (!_impl) return false;
+
+ impl::xpath_context c(n, 1, 1);
+ impl::xpath_stack_data sd;
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ if (setjmp(sd.error_handler)) return false;
+ #endif
+
+ return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
+ }
+
+ PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const
+ {
+ if (!_impl) return impl::gen_nan();
+
+ impl::xpath_context c(n, 1, 1);
+ impl::xpath_stack_data sd;
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ if (setjmp(sd.error_handler)) return impl::gen_nan();
+ #endif
+
+ return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
+ }
+
+#ifndef PUGIXML_NO_STL
+ PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const
+ {
+ impl::xpath_stack_data sd;
+
+ return impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd).c_str();
+ }
+#endif
+
+ PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const
+ {
+ impl::xpath_stack_data sd;
+
+ impl::xpath_string r = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+
+ size_t full_size = r.length() + 1;
+
+ if (capacity > 0)
+ {
+ size_t size = (full_size < capacity) ? full_size : capacity;
+ assert(size > 0);
+
+ memcpy(buffer, r.c_str(), (size - 1) * sizeof(char_t));
+ buffer[size - 1] = 0;
+ }
+
+ return full_size;
+ }
+
+ PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const
+ {
+ if (!_impl) return xpath_node_set();
+
+ impl::xpath_ast_node* root = static_cast<impl::xpath_query_impl*>(_impl)->root;
+
+ if (root->rettype() != xpath_type_node_set)
+ {
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ return xpath_node_set();
+ #else
+ xpath_parse_result res;
+ res.error = "Expression does not evaluate to node set";
+
+ throw xpath_exception(res);
+ #endif
+ }
+
+ impl::xpath_context c(n, 1, 1);
+ impl::xpath_stack_data sd;
+
+ #ifdef PUGIXML_NO_EXCEPTIONS
+ if (setjmp(sd.error_handler)) return xpath_node_set();
+ #endif
+
+ impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack);
+
+ return xpath_node_set(r.begin(), r.end(), r.type());
+ }
+
+ PUGI__FN const xpath_parse_result& xpath_query::result() const
+ {
+ return _result;
+ }
+
+ PUGI__FN static void unspecified_bool_xpath_query(xpath_query***)
+ {
+ }
+
+ PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const
+ {
+ return _impl ? unspecified_bool_xpath_query : 0;
+ }
+
+ PUGI__FN bool xpath_query::operator!() const
+ {
+ return !_impl;
+ }
+
+ PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const
+ {
+ xpath_query q(query, variables);
+ return select_single_node(q);
+ }
+
+ PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const
+ {
+ xpath_node_set s = query.evaluate_node_set(*this);
+ return s.empty() ? xpath_node() : s.first();
+ }
+
+ PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const
+ {
+ xpath_query q(query, variables);
+ return select_nodes(q);
+ }
+
+ PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const
+ {
+ return query.evaluate_node_set(*this);
+ }
+}
+
+#endif
+
+#ifdef __BORLANDC__
+# pragma option pop
+#endif
+
+// Intel C++ does not properly keep warning state for function templates,
+// so popping warning state at the end of translation unit leads to warnings in the middle.
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+# pragma warning(pop)
+#endif
+
+// Undefine all local macros (makes sure we're not leaking macros in header-only mode)
+#undef PUGI__NO_INLINE
+#undef PUGI__STATIC_ASSERT
+#undef PUGI__DMC_VOLATILE
+#undef PUGI__MSVC_CRT_VERSION
+#undef PUGI__NS_BEGIN
+#undef PUGI__NS_END
+#undef PUGI__FN
+#undef PUGI__FN_NO_INLINE
+#undef PUGI__IS_CHARTYPE_IMPL
+#undef PUGI__IS_CHARTYPE
+#undef PUGI__IS_CHARTYPEX
+#undef PUGI__SKIPWS
+#undef PUGI__OPTSET
+#undef PUGI__PUSHNODE
+#undef PUGI__POPNODE
+#undef PUGI__SCANFOR
+#undef PUGI__SCANWHILE
+#undef PUGI__ENDSEG
+#undef PUGI__THROW_ERROR
+#undef PUGI__CHECK_ERROR
+
+#endif
+
+/**
+ * Copyright (c) 2006-2012 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/phrase-extract/extract-mixed-syntax/pugixml.hpp b/phrase-extract/extract-mixed-syntax/pugixml.hpp
new file mode 100644
index 000000000..77b4dcf47
--- /dev/null
+++ b/phrase-extract/extract-mixed-syntax/pugixml.hpp
@@ -0,0 +1,1265 @@
+/**
+ * pugixml parser - version 1.2
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef PUGIXML_VERSION
+// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons
+# define PUGIXML_VERSION 120
+#endif
+
+// Include user configuration file (this can define various configuration macros)
+#include "pugiconfig.hpp"
+
+#ifndef HEADER_PUGIXML_HPP
+#define HEADER_PUGIXML_HPP
+
+// Include stddef.h for size_t and ptrdiff_t
+#include <stddef.h>
+
+// Include exception header for XPath
+#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
+# include <exception>
+#endif
+
+// Include STL headers
+#ifndef PUGIXML_NO_STL
+# include <iterator>
+# include <iosfwd>
+# include <string>
+#endif
+
+// Macro for deprecated features
+#ifndef PUGIXML_DEPRECATED
+# if defined(__GNUC__)
+# define PUGIXML_DEPRECATED __attribute__((deprecated))
+# elif defined(_MSC_VER) && _MSC_VER >= 1300
+# define PUGIXML_DEPRECATED __declspec(deprecated)
+# else
+# define PUGIXML_DEPRECATED
+# endif
+#endif
+
+// If no API is defined, assume default
+#ifndef PUGIXML_API
+# define PUGIXML_API
+#endif
+
+// If no API for classes is defined, assume default
+#ifndef PUGIXML_CLASS
+# define PUGIXML_CLASS PUGIXML_API
+#endif
+
+// If no API for functions is defined, assume default
+#ifndef PUGIXML_FUNCTION
+# define PUGIXML_FUNCTION PUGIXML_API
+#endif
+
+// Character interface macros
+#ifdef PUGIXML_WCHAR_MODE
+# define PUGIXML_TEXT(t) L ## t
+# define PUGIXML_CHAR wchar_t
+#else
+# define PUGIXML_TEXT(t) t
+# define PUGIXML_CHAR char
+#endif
+
+namespace pugi
+{
+ // Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE
+ typedef PUGIXML_CHAR char_t;
+
+#ifndef PUGIXML_NO_STL
+ // String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE
+ typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
+#endif
+}
+
+// The PugiXML namespace
+namespace pugi
+{
+ // Tree node types
+ enum xml_node_type
+ {
+ node_null, // Empty (null) node handle
+ node_document, // A document tree's absolute root
+ node_element, // Element tag, i.e. '<node/>'
+ node_pcdata, // Plain character data, i.e. 'text'
+ node_cdata, // Character data, i.e. '<![CDATA[text]]>'
+ node_comment, // Comment tag, i.e. '<!-- text -->'
+ node_pi, // Processing instruction, i.e. '<?name?>'
+ node_declaration, // Document declaration, i.e. '<?xml version="1.0"?>'
+ node_doctype // Document type declaration, i.e. '<!DOCTYPE doc>'
+ };
+
+ // Parsing options
+
+ // Minimal parsing mode (equivalent to turning all other flags off).
+ // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
+ const unsigned int parse_minimal = 0x0000;
+
+ // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
+ const unsigned int parse_pi = 0x0001;
+
+ // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
+ const unsigned int parse_comments = 0x0002;
+
+ // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
+ const unsigned int parse_cdata = 0x0004;
+
+ // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
+ // This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
+ const unsigned int parse_ws_pcdata = 0x0008;
+
+ // This flag determines if character and entity references are expanded during parsing. This flag is on by default.
+ const unsigned int parse_escapes = 0x0010;
+
+ // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
+ const unsigned int parse_eol = 0x0020;
+
+ // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
+ const unsigned int parse_wconv_attribute = 0x0040;
+
+ // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
+ const unsigned int parse_wnorm_attribute = 0x0080;
+
+ // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
+ const unsigned int parse_declaration = 0x0100;
+
+ // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
+ const unsigned int parse_doctype = 0x0200;
+
+ // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
+ // of whitespace is added to the DOM tree.
+ // This flag is off by default; turning it on may result in slower parsing and more memory consumption.
+ const unsigned int parse_ws_pcdata_single = 0x0400;
+
+ // The default parsing mode.
+ // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
+ // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+ const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
+
+ // The full parsing mode.
+ // Nodes of all types are added to the DOM tree, character/reference entities are expanded,
+ // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+ const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
+
+ // These flags determine the encoding of input data for XML document
+ enum xml_encoding
+ {
+ encoding_auto, // Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
+ encoding_utf8, // UTF8 encoding
+ encoding_utf16_le, // Little-endian UTF16
+ encoding_utf16_be, // Big-endian UTF16
+ encoding_utf16, // UTF16 with native endianness
+ encoding_utf32_le, // Little-endian UTF32
+ encoding_utf32_be, // Big-endian UTF32
+ encoding_utf32, // UTF32 with native endianness
+ encoding_wchar, // The same encoding wchar_t has (either UTF16 or UTF32)
+ encoding_latin1
+ };
+
+ // Formatting flags
+
+ // Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
+ const unsigned int format_indent = 0x01;
+
+ // Write encoding-specific BOM to the output stream. This flag is off by default.
+ const unsigned int format_write_bom = 0x02;
+
+ // Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
+ const unsigned int format_raw = 0x04;
+
+ // Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
+ const unsigned int format_no_declaration = 0x08;
+
+ // Don't escape attribute values and PCDATA contents. This flag is off by default.
+ const unsigned int format_no_escapes = 0x10;
+
+ // Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
+ const unsigned int format_save_file_text = 0x20;
+
+ // The default set of formatting flags.
+ // Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
+ const unsigned int format_default = format_indent;
+
+ // Forward declarations
+ struct xml_attribute_struct;
+ struct xml_node_struct;
+
+ class xml_node_iterator;
+ class xml_attribute_iterator;
+ class xml_named_node_iterator;
+
+ class xml_tree_walker;
+
+ class xml_node;
+
+ class xml_text;
+
+ #ifndef PUGIXML_NO_XPATH
+ class xpath_node;
+ class xpath_node_set;
+ class xpath_query;
+ class xpath_variable_set;
+ #endif
+
+ // Range-based for loop support
+ template <typename It> class xml_object_range
+ {
+ public:
+ typedef It const_iterator;
+
+ xml_object_range(It b, It e): _begin(b), _end(e)
+ {
+ }
+
+ It begin() const { return _begin; }
+ It end() const { return _end; }
+
+ private:
+ It _begin, _end;
+ };
+
+ // Writer interface for node printing (see xml_node::print)
+ class PUGIXML_CLASS xml_writer
+ {
+ public:
+ virtual ~xml_writer() {}
+
+ // Write memory chunk into stream/file/whatever
+ virtual void write(const void* data, size_t size) = 0;
+ };
+
+ // xml_writer implementation for FILE*
+ class PUGIXML_CLASS xml_writer_file: public xml_writer
+ {
+ public:
+ // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
+ xml_writer_file(void* file);
+
+ virtual void write(const void* data, size_t size);
+
+ private:
+ void* file;
+ };
+
+ #ifndef PUGIXML_NO_STL
+ // xml_writer implementation for streams
+ class PUGIXML_CLASS xml_writer_stream: public xml_writer
+ {
+ public:
+ // Construct writer from an output stream object
+ xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
+ xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
+
+ virtual void write(const void* data, size_t size);
+
+ private:
+ std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
+ std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream;
+ };
+ #endif
+
+ // A light-weight handle for manipulating attributes in DOM tree
+ class PUGIXML_CLASS xml_attribute
+ {
+ friend class xml_attribute_iterator;
+ friend class xml_node;
+
+ private:
+ xml_attribute_struct* _attr;
+
+ typedef void (*unspecified_bool_type)(xml_attribute***);
+
+ public:
+ // Default constructor. Constructs an empty attribute.
+ xml_attribute();
+
+ // Constructs attribute from internal pointer
+ explicit xml_attribute(xml_attribute_struct* attr);
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+
+ // Comparison operators (compares wrapped attribute pointers)
+ bool operator==(const xml_attribute& r) const;
+ bool operator!=(const xml_attribute& r) const;
+ bool operator<(const xml_attribute& r) const;
+ bool operator>(const xml_attribute& r) const;
+ bool operator<=(const xml_attribute& r) const;
+ bool operator>=(const xml_attribute& r) const;
+
+ // Check if attribute is empty
+ bool empty() const;
+
+ // Get attribute name/value, or "" if attribute is empty
+ const char_t* name() const;
+ const char_t* value() const;
+
+ // Get attribute value, or the default value if attribute is empty
+ const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+ // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
+ int as_int(int def = 0) const;
+ unsigned int as_uint(unsigned int def = 0) const;
+ double as_double(double def = 0) const;
+ float as_float(float def = 0) const;
+
+ // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
+ bool as_bool(bool def = false) const;
+
+ // Set attribute name/value (returns false if attribute is empty or there is not enough memory)
+ bool set_name(const char_t* rhs);
+ bool set_value(const char_t* rhs);
+
+ // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+ bool set_value(int rhs);
+ bool set_value(unsigned int rhs);
+ bool set_value(double rhs);
+ bool set_value(bool rhs);
+
+ // Set attribute value (equivalent to set_value without error checking)
+ xml_attribute& operator=(const char_t* rhs);
+ xml_attribute& operator=(int rhs);
+ xml_attribute& operator=(unsigned int rhs);
+ xml_attribute& operator=(double rhs);
+ xml_attribute& operator=(bool rhs);
+
+ // Get next/previous attribute in the attribute list of the parent node
+ xml_attribute next_attribute() const;
+ xml_attribute previous_attribute() const;
+
+ // Get hash value (unique for handles to the same object)
+ size_t hash_value() const;
+
+ // Get internal pointer
+ xml_attribute_struct* internal_object() const;
+ };
+
+#ifdef __BORLANDC__
+ // Borland C++ workaround
+ bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
+ bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
+#endif
+
+ // A light-weight handle for manipulating nodes in DOM tree
+ class PUGIXML_CLASS xml_node
+ {
+ friend class xml_attribute_iterator;
+ friend class xml_node_iterator;
+ friend class xml_named_node_iterator;
+
+ protected:
+ xml_node_struct* _root;
+
+ typedef void (*unspecified_bool_type)(xml_node***);
+
+ public:
+ // Default constructor. Constructs an empty node.
+ xml_node();
+
+ // Constructs node from internal pointer
+ explicit xml_node(xml_node_struct* p);
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+
+ // Comparison operators (compares wrapped node pointers)
+ bool operator==(const xml_node& r) const;
+ bool operator!=(const xml_node& r) const;
+ bool operator<(const xml_node& r) const;
+ bool operator>(const xml_node& r) const;
+ bool operator<=(const xml_node& r) const;
+ bool operator>=(const xml_node& r) const;
+
+ // Check if node is empty.
+ bool empty() const;
+
+ // Get node type
+ xml_node_type type() const;
+
+ // Get node name/value, or "" if node is empty or it has no name/value
+ const char_t* name() const;
+ const char_t* value() const;
+
+ // Get attribute list
+ xml_attribute first_attribute() const;
+ xml_attribute last_attribute() const;
+
+ // Get children list
+ xml_node first_child() const;
+ xml_node last_child() const;
+
+ // Get next/previous sibling in the children list of the parent node
+ xml_node next_sibling() const;
+ xml_node previous_sibling() const;
+
+ // Get parent node
+ xml_node parent() const;
+
+ // Get root of DOM tree this node belongs to
+ xml_node root() const;
+
+ // Get text object for the current node
+ xml_text text() const;
+
+ // Get child, attribute or next/previous sibling with the specified name
+ xml_node child(const char_t* name) const;
+ xml_attribute attribute(const char_t* name) const;
+ xml_node next_sibling(const char_t* name) const;
+ xml_node previous_sibling(const char_t* name) const;
+
+ // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
+ const char_t* child_value() const;
+
+ // Get child value of child with specified name. Equivalent to child(name).child_value().
+ const char_t* child_value(const char_t* name) const;
+
+ // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
+ bool set_name(const char_t* rhs);
+ bool set_value(const char_t* rhs);
+
+ // Add attribute with specified name. Returns added attribute, or empty attribute on errors.
+ xml_attribute append_attribute(const char_t* name);
+ xml_attribute prepend_attribute(const char_t* name);
+ xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
+ xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
+
+ // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
+ xml_attribute append_copy(const xml_attribute& proto);
+ xml_attribute prepend_copy(const xml_attribute& proto);
+ xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
+ xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
+
+ // Add child node with specified type. Returns added node, or empty node on errors.
+ xml_node append_child(xml_node_type type = node_element);
+ xml_node prepend_child(xml_node_type type = node_element);
+ xml_node insert_child_after(xml_node_type type, const xml_node& node);
+ xml_node insert_child_before(xml_node_type type, const xml_node& node);
+
+ // Add child element with specified name. Returns added node, or empty node on errors.
+ xml_node append_child(const char_t* name);
+ xml_node prepend_child(const char_t* name);
+ xml_node insert_child_after(const char_t* name, const xml_node& node);
+ xml_node insert_child_before(const char_t* name, const xml_node& node);
+
+ // Add a copy of the specified node as a child. Returns added node, or empty node on errors.
+ xml_node append_copy(const xml_node& proto);
+ xml_node prepend_copy(const xml_node& proto);
+ xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
+ xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
+
+ // Remove specified attribute
+ bool remove_attribute(const xml_attribute& a);
+ bool remove_attribute(const char_t* name);
+
+ // Remove specified child
+ bool remove_child(const xml_node& n);
+ bool remove_child(const char_t* name);
+
+ // Find attribute using predicate. Returns first attribute for which predicate returned true.
+ template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
+ {
+ if (!_root) return xml_attribute();
+
+ for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
+ if (pred(attrib))
+ return attrib;
+
+ return xml_attribute();
+ }
+
+ // Find child node using predicate. Returns first child for which predicate returned true.
+ template <typename Predicate> xml_node find_child(Predicate pred) const
+ {
+ if (!_root) return xml_node();
+
+ for (xml_node node = first_child(); node; node = node.next_sibling())
+ if (pred(node))
+ return node;
+
+ return xml_node();
+ }
+
+ // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true.
+ template <typename Predicate> xml_node find_node(Predicate pred) const
+ {
+ if (!_root) return xml_node();
+
+ xml_node cur = first_child();
+
+ while (cur._root && cur._root != _root)
+ {
+ if (pred(cur)) return cur;
+
+ if (cur.first_child()) cur = cur.first_child();
+ else if (cur.next_sibling()) cur = cur.next_sibling();
+ else
+ {
+ while (!cur.next_sibling() && cur._root != _root) cur = cur.parent();
+
+ if (cur._root != _root) cur = cur.next_sibling();
+ }
+ }
+
+ return xml_node();
+ }
+
+ // Find child node by attribute name/value
+ xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
+ xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
+
+ #ifndef PUGIXML_NO_STL
+ // Get the absolute node path from root as a text string.
+ string_t path(char_t delimiter = '/') const;
+ #endif
+
+ // Search for a node by path consisting of node names and . or .. elements.
+ xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
+
+ // Recursively traverse subtree with xml_tree_walker
+ bool traverse(xml_tree_walker& walker);
+
+ #ifndef PUGIXML_NO_XPATH
+ // Select single node by evaluating XPath query. Returns first node from the resulting node set.
+ xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+ xpath_node select_single_node(const xpath_query& query) const;
+
+ // Select node set by evaluating XPath query
+ xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
+ xpath_node_set select_nodes(const xpath_query& query) const;
+ #endif
+
+ // Print subtree using a writer object
+ void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+
+ #ifndef PUGIXML_NO_STL
+ // Print subtree to stream
+ void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+ void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
+ #endif
+
+ // Child nodes iterators
+ typedef xml_node_iterator iterator;
+
+ iterator begin() const;
+ iterator end() const;
+
+ // Attribute iterators
+ typedef xml_attribute_iterator attribute_iterator;
+
+ attribute_iterator attributes_begin() const;
+ attribute_iterator attributes_end() const;
+
+ // Range-based for support
+ xml_object_range<xml_node_iterator> children() const;
+ xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
+ xml_object_range<xml_attribute_iterator> attributes() const;
+
+ // Get node offset in parsed file/string (in char_t units) for debugging purposes
+ ptrdiff_t offset_debug() const;
+
+ // Get hash value (unique for handles to the same object)
+ size_t hash_value() const;
+
+ // Get internal pointer
+ xml_node_struct* internal_object() const;
+ };
+
+#ifdef __BORLANDC__
+ // Borland C++ workaround
+ bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
+ bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
+#endif
+
+ // A helper for working with text inside PCDATA nodes
+ class PUGIXML_CLASS xml_text
+ {
+ friend class xml_node;
+
+ xml_node_struct* _root;
+
+ typedef void (*unspecified_bool_type)(xml_text***);
+
+ explicit xml_text(xml_node_struct* root);
+
+ xml_node_struct* _data_new();
+ xml_node_struct* _data() const;
+
+ public:
+ // Default constructor. Constructs an empty object.
+ xml_text();
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+
+ // Check if text object is empty
+ bool empty() const;
+
+ // Get text, or "" if object is empty
+ const char_t* get() const;
+
+ // Get text, or the default value if object is empty
+ const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+ // Get text as a number, or the default value if conversion did not succeed or object is empty
+ int as_int(int def = 0) const;
+ unsigned int as_uint(unsigned int def = 0) const;
+ double as_double(double def = 0) const;
+ float as_float(float def = 0) const;
+
+ // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
+ bool as_bool(bool def = false) const;
+
+ // Set text (returns false if object is empty or there is not enough memory)
+ bool set(const char_t* rhs);
+
+ // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+ bool set(int rhs);
+ bool set(unsigned int rhs);
+ bool set(double rhs);
+ bool set(bool rhs);
+
+ // Set text (equivalent to set without error checking)
+ xml_text& operator=(const char_t* rhs);
+ xml_text& operator=(int rhs);
+ xml_text& operator=(unsigned int rhs);
+ xml_text& operator=(double rhs);
+ xml_text& operator=(bool rhs);
+
+ // Get the data node (node_pcdata or node_cdata) for this object
+ xml_node data() const;
+ };
+
+#ifdef __BORLANDC__
+ // Borland C++ workaround
+ bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
+ bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
+#endif
+
+ // Child node iterator (a bidirectional iterator over a collection of xml_node)
+ class PUGIXML_CLASS xml_node_iterator
+ {
+ friend class xml_node;
+
+ private:
+ mutable xml_node _wrap;
+ xml_node _parent;
+
+ xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent);
+
+ public:
+ // Iterator traits
+ typedef ptrdiff_t difference_type;
+ typedef xml_node value_type;
+ typedef xml_node* pointer;
+ typedef xml_node& reference;
+
+ #ifndef PUGIXML_NO_STL
+ typedef std::bidirectional_iterator_tag iterator_category;
+ #endif
+
+ // Default constructor
+ xml_node_iterator();
+
+ // Construct an iterator which points to the specified node
+ xml_node_iterator(const xml_node& node);
+
+ // Iterator operators
+ bool operator==(const xml_node_iterator& rhs) const;
+ bool operator!=(const xml_node_iterator& rhs) const;
+
+ xml_node& operator*() const;
+ xml_node* operator->() const;
+
+ const xml_node_iterator& operator++();
+ xml_node_iterator operator++(int);
+
+ const xml_node_iterator& operator--();
+ xml_node_iterator operator--(int);
+ };
+
+ // Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
+ class PUGIXML_CLASS xml_attribute_iterator
+ {
+ friend class xml_node;
+
+ private:
+ mutable xml_attribute _wrap;
+ xml_node _parent;
+
+ xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent);
+
+ public:
+ // Iterator traits
+ typedef ptrdiff_t difference_type;
+ typedef xml_attribute value_type;
+ typedef xml_attribute* pointer;
+ typedef xml_attribute& reference;
+
+ #ifndef PUGIXML_NO_STL
+ typedef std::bidirectional_iterator_tag iterator_category;
+ #endif
+
+ // Default constructor
+ xml_attribute_iterator();
+
+ // Construct an iterator which points to the specified attribute
+ xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
+
+ // Iterator operators
+ bool operator==(const xml_attribute_iterator& rhs) const;
+ bool operator!=(const xml_attribute_iterator& rhs) const;
+
+ xml_attribute& operator*() const;
+ xml_attribute* operator->() const;
+
+ const xml_attribute_iterator& operator++();
+ xml_attribute_iterator operator++(int);
+
+ const xml_attribute_iterator& operator--();
+ xml_attribute_iterator operator--(int);
+ };
+
+ // Named node range helper
+ class xml_named_node_iterator
+ {
+ public:
+ // Iterator traits
+ typedef ptrdiff_t difference_type;
+ typedef xml_node value_type;
+ typedef xml_node* pointer;
+ typedef xml_node& reference;
+
+ #ifndef PUGIXML_NO_STL
+ typedef std::forward_iterator_tag iterator_category;
+ #endif
+
+ // Default constructor
+ xml_named_node_iterator();
+
+ // Construct an iterator which points to the specified node
+ xml_named_node_iterator(const xml_node& node, const char_t* name);
+
+ // Iterator operators
+ bool operator==(const xml_named_node_iterator& rhs) const;
+ bool operator!=(const xml_named_node_iterator& rhs) const;
+
+ xml_node& operator*() const;
+ xml_node* operator->() const;
+
+ const xml_named_node_iterator& operator++();
+ xml_named_node_iterator operator++(int);
+
+ private:
+ mutable xml_node _node;
+ const char_t* _name;
+ };
+
+ // Abstract tree walker class (see xml_node::traverse)
+ class PUGIXML_CLASS xml_tree_walker
+ {
+ friend class xml_node;
+
+ private:
+ int _depth;
+
+ protected:
+ // Get current traversal depth
+ int depth() const;
+
+ public:
+ xml_tree_walker();
+ virtual ~xml_tree_walker();
+
+ // Callback that is called when traversal begins
+ virtual bool begin(xml_node& node);
+
+ // Callback that is called for each node traversed
+ virtual bool for_each(xml_node& node) = 0;
+
+ // Callback that is called when traversal ends
+ virtual bool end(xml_node& node);
+ };
+
+ // Parsing status, returned as part of xml_parse_result object
+ enum xml_parse_status
+ {
+ status_ok = 0, // No error
+
+ status_file_not_found, // File was not found during load_file()
+ status_io_error, // Error reading from file/stream
+ status_out_of_memory, // Could not allocate memory
+ status_internal_error, // Internal error occurred
+
+ status_unrecognized_tag, // Parser could not determine tag type
+
+ status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction
+ status_bad_comment, // Parsing error occurred while parsing comment
+ status_bad_cdata, // Parsing error occurred while parsing CDATA section
+ status_bad_doctype, // Parsing error occurred while parsing document type declaration
+ status_bad_pcdata, // Parsing error occurred while parsing PCDATA section
+ status_bad_start_element, // Parsing error occurred while parsing start element tag
+ status_bad_attribute, // Parsing error occurred while parsing element attribute
+ status_bad_end_element, // Parsing error occurred while parsing end element tag
+ status_end_element_mismatch // There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
+ };
+
+ // Parsing result
+ struct PUGIXML_CLASS xml_parse_result
+ {
+ // Parsing status (see xml_parse_status)
+ xml_parse_status status;
+
+ // Last parsed offset (in char_t units from start of input data)
+ ptrdiff_t offset;
+
+ // Source document encoding
+ xml_encoding encoding;
+
+ // Default constructor, initializes object to failed state
+ xml_parse_result();
+
+ // Cast to bool operator
+ operator bool() const;
+
+ // Get error description
+ const char* description() const;
+ };
+
+ // Document class (DOM tree root)
+ class PUGIXML_CLASS xml_document: public xml_node
+ {
+ private:
+ char_t* _buffer;
+
+ char _memory[192];
+
+ // Non-copyable semantics
+ xml_document(const xml_document&);
+ const xml_document& operator=(const xml_document&);
+
+ void create();
+ void destroy();
+
+ xml_parse_result load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own);
+
+ public:
+ // Default constructor, makes empty document
+ xml_document();
+
+ // Destructor, invalidates all node/attribute handles to this document
+ ~xml_document();
+
+ // Removes all nodes, leaving the empty document
+ void reset();
+
+ // Removes all nodes, then copies the entire contents of the specified document
+ void reset(const xml_document& proto);
+
+ #ifndef PUGIXML_NO_STL
+ // Load document from stream.
+ xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+ xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
+ #endif
+
+ // Load document from zero-terminated string. No encoding conversions are applied.
+ xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+
+ // Load document from file
+ xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+ xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+ // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
+ xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+ // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+ // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
+ xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+ // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+ // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
+ xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+ // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
+ void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+ #ifndef PUGIXML_NO_STL
+ // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
+ void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+ void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
+ #endif
+
+ // Save XML to file
+ bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+ bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+ // Get document element
+ xml_node document_element() const;
+ };
+
+#ifndef PUGIXML_NO_XPATH
+ // XPath query return type
+ enum xpath_value_type
+ {
+ xpath_type_none, // Unknown type (query failed to compile)
+ xpath_type_node_set, // Node set (xpath_node_set)
+ xpath_type_number, // Number
+ xpath_type_string, // String
+ xpath_type_boolean // Boolean
+ };
+
+ // XPath parsing result
+ struct PUGIXML_CLASS xpath_parse_result
+ {
+ // Error message (0 if no error)
+ const char* error;
+
+ // Last parsed offset (in char_t units from string start)
+ ptrdiff_t offset;
+
+ // Default constructor, initializes object to failed state
+ xpath_parse_result();
+
+ // Cast to bool operator
+ operator bool() const;
+
+ // Get error description
+ const char* description() const;
+ };
+
+ // A single XPath variable
+ class PUGIXML_CLASS xpath_variable
+ {
+ friend class xpath_variable_set;
+
+ protected:
+ xpath_value_type _type;
+ xpath_variable* _next;
+
+ xpath_variable();
+
+ // Non-copyable semantics
+ xpath_variable(const xpath_variable&);
+ xpath_variable& operator=(const xpath_variable&);
+
+ public:
+ // Get variable name
+ const char_t* name() const;
+
+ // Get variable type
+ xpath_value_type type() const;
+
+ // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
+ bool get_boolean() const;
+ double get_number() const;
+ const char_t* get_string() const;
+ const xpath_node_set& get_node_set() const;
+
+ // Set variable value; no type conversion is performed, false is returned on type mismatch error
+ bool set(bool value);
+ bool set(double value);
+ bool set(const char_t* value);
+ bool set(const xpath_node_set& value);
+ };
+
+ // A set of XPath variables
+ class PUGIXML_CLASS xpath_variable_set
+ {
+ private:
+ xpath_variable* _data[64];
+
+ // Non-copyable semantics
+ xpath_variable_set(const xpath_variable_set&);
+ xpath_variable_set& operator=(const xpath_variable_set&);
+
+ xpath_variable* find(const char_t* name) const;
+
+ public:
+ // Default constructor/destructor
+ xpath_variable_set();
+ ~xpath_variable_set();
+
+ // Add a new variable or get the existing one, if the types match
+ xpath_variable* add(const char_t* name, xpath_value_type type);
+
+ // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
+ bool set(const char_t* name, bool value);
+ bool set(const char_t* name, double value);
+ bool set(const char_t* name, const char_t* value);
+ bool set(const char_t* name, const xpath_node_set& value);
+
+ // Get existing variable by name
+ xpath_variable* get(const char_t* name);
+ const xpath_variable* get(const char_t* name) const;
+ };
+
+ // A compiled XPath query object
+ class PUGIXML_CLASS xpath_query
+ {
+ private:
+ void* _impl;
+ xpath_parse_result _result;
+
+ typedef void (*unspecified_bool_type)(xpath_query***);
+
+ // Non-copyable semantics
+ xpath_query(const xpath_query&);
+ xpath_query& operator=(const xpath_query&);
+
+ public:
+ // Construct a compiled object from XPath expression.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
+ explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
+
+ // Destructor
+ ~xpath_query();
+
+ // Get query expression return type
+ xpath_value_type return_type() const;
+
+ // Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+ bool evaluate_boolean(const xpath_node& n) const;
+
+ // Evaluate expression as double value in the specified context; performs type conversion if necessary.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+ double evaluate_number(const xpath_node& n) const;
+
+ #ifndef PUGIXML_NO_STL
+ // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+ string_t evaluate_string(const xpath_node& n) const;
+ #endif
+
+ // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+ // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+ // If PUGIXML_NO_EXCEPTIONS is defined, returns empty set instead.
+ size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
+
+ // Evaluate expression as node set in the specified context.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+ // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
+ xpath_node_set evaluate_node_set(const xpath_node& n) const;
+
+ // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
+ const xpath_parse_result& result() const;
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+ };
+
+ #ifndef PUGIXML_NO_EXCEPTIONS
+ // XPath exception class
+ class PUGIXML_CLASS xpath_exception: public std::exception
+ {
+ private:
+ xpath_parse_result _result;
+
+ public:
+ // Construct exception from parse result
+ explicit xpath_exception(const xpath_parse_result& result);
+
+ // Get error message
+ virtual const char* what() const throw();
+
+ // Get parse result
+ const xpath_parse_result& result() const;
+ };
+ #endif
+
+ // XPath node class (either xml_node or xml_attribute)
+ class PUGIXML_CLASS xpath_node
+ {
+ private:
+ xml_node _node;
+ xml_attribute _attribute;
+
+ typedef void (*unspecified_bool_type)(xpath_node***);
+
+ public:
+ // Default constructor; constructs empty XPath node
+ xpath_node();
+
+ // Construct XPath node from XML node/attribute
+ xpath_node(const xml_node& node);
+ xpath_node(const xml_attribute& attribute, const xml_node& parent);
+
+ // Get node/attribute, if any
+ xml_node node() const;
+ xml_attribute attribute() const;
+
+ // Get parent of contained node/attribute
+ xml_node parent() const;
+
+ // Safe bool conversion operator
+ operator unspecified_bool_type() const;
+
+ // Borland C++ workaround
+ bool operator!() const;
+
+ // Comparison operators
+ bool operator==(const xpath_node& n) const;
+ bool operator!=(const xpath_node& n) const;
+ };
+
+#ifdef __BORLANDC__
+ // Borland C++ workaround
+ bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
+ bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
+#endif
+
+ // A fixed-size collection of XPath nodes
+ class PUGIXML_CLASS xpath_node_set
+ {
+ public:
+ // Collection type
+ enum type_t
+ {
+ type_unsorted, // Not ordered
+ type_sorted, // Sorted by document order (ascending)
+ type_sorted_reverse // Sorted by document order (descending)
+ };
+
+ // Constant iterator type
+ typedef const xpath_node* const_iterator;
+
+ // Default constructor. Constructs empty set.
+ xpath_node_set();
+
+ // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
+ xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
+
+ // Destructor
+ ~xpath_node_set();
+
+ // Copy constructor/assignment operator
+ xpath_node_set(const xpath_node_set& ns);
+ xpath_node_set& operator=(const xpath_node_set& ns);
+
+ // Get collection type
+ type_t type() const;
+
+ // Get collection size
+ size_t size() const;
+
+ // Indexing operator
+ const xpath_node& operator[](size_t index) const;
+
+ // Collection iterators
+ const_iterator begin() const;
+ const_iterator end() const;
+
+ // Sort the collection in ascending/descending order by document order
+ void sort(bool reverse = false);
+
+ // Get first node in the collection by document order
+ xpath_node first() const;
+
+ // Check if collection is empty
+ bool empty() const;
+
+ private:
+ type_t _type;
+
+ xpath_node _storage;
+
+ xpath_node* _begin;
+ xpath_node* _end;
+
+ void _assign(const_iterator begin, const_iterator end);
+ };
+#endif
+
+#ifndef PUGIXML_NO_STL
+ // Convert wide string to UTF8
+ std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
+ std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
+
+ // Convert UTF8 to wide string
+ std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
+ std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
+#endif
+
+ // Memory allocation function interface; returns pointer to allocated memory or NULL on failure
+ typedef void* (*allocation_function)(size_t size);
+
+ // Memory deallocation function interface
+ typedef void (*deallocation_function)(void* ptr);
+
+ // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
+ void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
+
+ // Get current memory management functions
+ allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
+ deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+ // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+ std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
+ std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
+ std::forward_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+ // Workarounds for (non-standard) iterator category detection
+ std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
+ std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
+ std::forward_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#endif
+
+/**
+ * Copyright (c) 2006-2012 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp
deleted file mode 100644
index b37309d47..000000000
--- a/phrase-extract/extract-ordering-main.cpp
+++ /dev/null
@@ -1,687 +0,0 @@
-/*
- * extract.cpp
- * Modified by: Rohit Gupta CDAC, Mumbai, India
- * on July 15, 2012 to implement parallel processing
- * Modified by: Nadi Tomeh - LIMSI/CNRS
- * Machine Translation Marathon 2010, Dublin
- */
-
-#include <cstdio>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <string>
-#include <stdlib.h>
-#include <assert.h>
-#include <cstring>
-#include <sstream>
-#include <map>
-#include <set>
-#include <vector>
-
-#include "SafeGetline.h"
-#include "SentenceAlignment.h"
-#include "tables-core.h"
-#include "InputFileStream.h"
-#include "OutputFileStream.h"
-#include "PhraseExtractionOptions.h"
-
-using namespace std;
-using namespace MosesTraining;
-
-namespace MosesTraining
-{
-
-
-const long int LINE_MAX_LENGTH = 500000 ;
-
-
-// HPhraseVertex represents a point in the alignment matrix
-typedef pair <int, int> HPhraseVertex;
-
-// Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix:
-// bottom-left and top-right
-typedef pair<HPhraseVertex, HPhraseVertex> HPhrase;
-
-// HPhraseVector is a vector of HPhrases
-typedef vector < HPhrase > HPhraseVector;
-
-// SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning
-// The key of the map is the English index and the value is a set of the source ones
-typedef map <int, set<int> > HSentenceVertices;
-
-REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
- int, int, int, int, int, int, int,
- bool (*)(int, int), bool (*)(int, int));
-REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
- int, int, int, int, int, int, int,
- bool (*)(int, int), bool (*)(int, int),
- const HSentenceVertices &, const HSentenceVertices &);
-REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
- int, int, int, int, int, int, int,
- bool (*)(int, int), bool (*)(int, int),
- const HSentenceVertices &, const HSentenceVertices &,
- const HSentenceVertices &, const HSentenceVertices &,
- REO_POS);
-
-void insertVertex(HSentenceVertices &, int, int);
-void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
- int, int, int, int);
-string getOrientString(REO_POS, REO_MODEL_TYPE);
-
-bool ge(int, int);
-bool le(int, int);
-bool lt(int, int);
-
-bool isAligned (SentenceAlignment &, int, int);
-
-int sentenceOffset = 0;
-
-std::vector<std::string> Tokenize(const std::string& str,
- const std::string& delimiters = " \t");
-
-bool flexScoreFlag = false;
-
-}
-
-namespace MosesTraining
-{
-
-class ExtractTask
-{
-public:
- ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFileOrientation)
- :m_sentence(sentence),
- m_options(initoptions),
- m_extractFileOrientation(extractFileOrientation)
- {}
- void Run();
-private:
- void extract(SentenceAlignment &);
- void addPhrase(SentenceAlignment &, int, int, int, int, string &);
- void writePhrasesToFile();
-
- SentenceAlignment &m_sentence;
- const PhraseExtractionOptions &m_options;
- Moses::OutputFileStream &m_extractFileOrientation;
-};
-}
-
-int main(int argc, char* argv[])
-{
- cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
- << "phrase extraction from an aligned parallel corpus\n";
-
- if (argc < 6) {
- cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
- cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
- exit(1);
- }
-
- Moses::OutputFileStream extractFileOrientation;
- const char* const &fileNameE = argv[1];
- const char* const &fileNameF = argv[2];
- const char* const &fileNameA = argv[3];
- const string fileNameExtract = string(argv[4]);
- PhraseExtractionOptions options(atoi(argv[5]));
-
- for(int i=6; i<argc; i++) {
- if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
- options.initOnlyOutputSpanInfo(true);
- } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
- options.initOrientationFlag(true);
- } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
- options.initFlexScoreFlag(true);
- } else if (strcmp(argv[i],"--NoTTable") == 0) {
- options.initTranslationFlag(false);
- } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
- options.initIncludeSentenceIdFlag(true);
- } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
- if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
- cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
- exit(1);
- }
- sentenceOffset = atoi(argv[++i]);
- } else if (strcmp(argv[i], "--GZOutput") == 0) {
- options.initGzOutput(true);
- } else if (strcmp(argv[i], "--InstanceWeights") == 0) {
- if (i+1 >= argc) {
- cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl;
- exit(1);
- }
- options.initInstanceWeightsFile(argv[++i]);
- } else if (strcmp(argv[i], "--Debug") == 0) {
- options.debug = true;
- } else if (strcmp(argv[i], "--MinPhraseLength") == 0) {
- options.minPhraseLength = atoi(argv[++i]);
- } else if (strcmp(argv[i], "--Separator") == 0) {
- options.separator = argv[++i];
- } else if(strcmp(argv[i],"--model") == 0) {
- if (i+1 >= argc) {
- cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
- exit(1);
- }
- char* modelParams = argv[++i];
- char* modelName = strtok(modelParams, "-");
- char* modelType = strtok(NULL, "-");
-
- // REO_MODEL_TYPE intModelType;
-
- if(strcmp(modelName, "wbe") == 0) {
- options.initWordModel(true);
- if(strcmp(modelType, "msd") == 0)
- options.initWordType(REO_MSD);
- else if(strcmp(modelType, "mslr") == 0)
- options.initWordType(REO_MSLR);
- else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- options.initWordType(REO_MONO);
- else {
- cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
- exit(1);
- }
- } else if(strcmp(modelName, "phrase") == 0) {
- options.initPhraseModel(true);
- if(strcmp(modelType, "msd") == 0)
- options.initPhraseType(REO_MSD);
- else if(strcmp(modelType, "mslr") == 0)
- options.initPhraseType(REO_MSLR);
- else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- options.initPhraseType(REO_MONO);
- else {
- cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
- exit(1);
- }
- } else if(strcmp(modelName, "hier") == 0) {
- options.initHierModel(true);
- if(strcmp(modelType, "msd") == 0)
- options.initHierType(REO_MSD);
- else if(strcmp(modelType, "mslr") == 0)
- options.initHierType(REO_MSLR);
- else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
- options.initHierType(REO_MONO);
- else {
- cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
- exit(1);
- }
- } else {
- cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
- exit(1);
- }
-
- options.initAllModelsOutputFlag(true);
- } else {
- cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
- exit(1);
- }
- }
-
- // default reordering model if no model selected
- // allows for the old syntax to be used
- if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
- options.initWordModel(true);
- options.initWordType(REO_MSD);
- }
-
- // open input files
- Moses::InputFileStream eFile(fileNameE);
- Moses::InputFileStream fFile(fileNameF);
- Moses::InputFileStream aFile(fileNameA);
-
- istream *eFileP = &eFile;
- istream *fFileP = &fFile;
- istream *aFileP = &aFile;
-
- istream *iwFileP = NULL;
- auto_ptr<Moses::InputFileStream> instanceWeightsFile;
- if (options.getInstanceWeightsFile().length()) {
- instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile()));
- iwFileP = instanceWeightsFile.get();
- }
-
- // open output files
- if (options.isOrientationFlag()) {
- string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
- extractFileOrientation.Open(fileNameExtractOrientation.c_str());
- }
-
- int i = sentenceOffset;
-
- while(true) {
- i++;
- if (i%10000 == 0) cerr << "." << flush;
- char englishString[LINE_MAX_LENGTH];
- char foreignString[LINE_MAX_LENGTH];
- char alignmentString[LINE_MAX_LENGTH];
- char weightString[LINE_MAX_LENGTH];
- SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
- if (eFileP->eof()) break;
- SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
- SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
- if (iwFileP) {
- SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
- }
- SentenceAlignment sentence;
- // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
- //az: output src, tgt, and alingment line
- if (options.isOnlyOutputSpanInfo()) {
- cout << "LOG: SRC: " << foreignString << endl;
- cout << "LOG: TGT: " << englishString << endl;
- cout << "LOG: ALT: " << alignmentString << endl;
- cout << "LOG: PHRASES_BEGIN:" << endl;
- }
- if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
- ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation);
- task->Run();
- delete task;
-
- }
- if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
- }
-
- eFile.Close();
- fFile.Close();
- aFile.Close();
-
- //az: only close if we actually opened it
- if (!options.isOnlyOutputSpanInfo()) {
- if (options.isOrientationFlag()) {
- extractFileOrientation.Close();
- }
- }
-}
-
-namespace MosesTraining
-{
-void ExtractTask::Run()
-{
- extract(m_sentence);
-}
-
-void ExtractTask::extract(SentenceAlignment &sentence)
-{
- int countE = sentence.target.size();
- int countF = sentence.source.size();
-
- HPhraseVector inboundPhrases;
-
- HSentenceVertices inTopLeft;
- HSentenceVertices inTopRight;
- HSentenceVertices inBottomLeft;
- HSentenceVertices inBottomRight;
-
- HSentenceVertices outTopLeft;
- HSentenceVertices outTopRight;
- HSentenceVertices outBottomLeft;
- HSentenceVertices outBottomRight;
-
- HSentenceVertices::const_iterator it;
-
- bool relaxLimit = m_options.isHierModel();
- bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
-
- // check alignments for target phrase startE...endE
- // loop over extracted phrases which are compatible with the word-alignments
- for(int startE=0; startE<countE; startE++) {
- for(int endE=startE;
- (endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
- endE++) {
-
- int minF = 9999;
- int maxF = -1;
- vector< int > usedF = sentence.alignedCountS;
- for(int ei=startE; ei<=endE; ei++) {
- for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
- int fi = sentence.alignedToT[ei][i];
- if (fi<minF) {
- minF = fi;
- }
- if (fi>maxF) {
- maxF = fi;
- }
- usedF[ fi ]--;
- }
- }
-
- if (maxF >= 0 && // aligned to any source words at all
- (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
-
- // check if source words are aligned to out of bound target words
- bool out_of_bounds = false;
- for(int fi=minF; fi<=maxF && !out_of_bounds; fi++)
- if (usedF[fi]>0) {
- // cout << "ouf of bounds: " << fi << "\n";
- out_of_bounds = true;
- }
-
- // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
- if (!out_of_bounds) {
- // start point of source phrase may retreat over unaligned
- for(int startF=minF;
- (startF>=0 &&
- (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
- (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
- startF--)
- // end point of source phrase may advance over unaligned
- for(int endF=maxF;
- (endF<countF &&
- (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
- (endF - startF + 1 > m_options.minPhraseLength) && // within length limit
- (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
- endF++) { // at this point we have extracted a phrase
- if(buildExtraStructure) { // phrase || hier
- if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
- inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
- HPhraseVertex(endF,endE)));
- insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
- startF, startE, endF, endE);
- } else
- insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
- startF, startE, endF, endE);
- } else {
- string orientationInfo = "";
- if(m_options.isWordModel()) {
- REO_POS wordPrevOrient, wordNextOrient;
- bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
- bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
- bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
- bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
- wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
- wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
- orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
- if(m_options.isAllModelsOutputFlag())
- " | | ";
- }
- addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
- }
- }
- }
- }
- }
- }
-
-
-}
-
-REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
- bool connectedLeftTop, bool connectedRightTop,
- int startF, int endF, int startE, int endE, int countF, int zero, int unit,
- bool (*ge)(int, int), bool (*lt)(int, int) )
-{
-
- if( connectedLeftTop && !connectedRightTop)
- return LEFT;
- if(modelType == REO_MONO)
- return UNKNOWN;
- if (!connectedLeftTop && connectedRightTop)
- return RIGHT;
- if(modelType == REO_MSD)
- return UNKNOWN;
- for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
- connectedLeftTop = isAligned(sentence, indexF, startE-unit);
- for(int indexF=endF+2*unit; (*lt)(indexF,countF) && !connectedRightTop; indexF=indexF+unit)
- connectedRightTop = isAligned(sentence, indexF, startE-unit);
- if(connectedLeftTop && !connectedRightTop)
- return DRIGHT;
- else if(!connectedLeftTop && connectedRightTop)
- return DLEFT;
- return UNKNOWN;
-}
-
-// to be called with countF-1 instead of countF
-REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
- bool connectedLeftTop, bool connectedRightTop,
- int startF, int endF, int startE, int endE, int countF, int zero, int unit,
- bool (*ge)(int, int), bool (*lt)(int, int),
- const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft)
-{
-
- HSentenceVertices::const_iterator it;
-
- if((connectedLeftTop && !connectedRightTop) ||
- //(startE == 0 && startF == 0) ||
- //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) ||
- ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
- it->second.find(startF-unit) != it->second.end()))
- return LEFT;
- if(modelType == REO_MONO)
- return UNKNOWN;
- if((!connectedLeftTop && connectedRightTop) ||
- ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end()))
- return RIGHT;
- if(modelType == REO_MSD)
- return UNKNOWN;
- connectedLeftTop = false;
- for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
- if(connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
- it->second.find(indexF) != it->second.end())
- return DRIGHT;
- connectedRightTop = false;
- for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit)
- if(connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
- it->second.find(indexF) != it->second.end())
- return DLEFT;
- return UNKNOWN;
-}
-
-// to be called with countF-1 instead of countF
-REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
- bool connectedLeftTop, bool connectedRightTop,
- int startF, int endF, int startE, int endE, int countF, int zero, int unit,
- bool (*ge)(int, int), bool (*lt)(int, int),
- const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft,
- const HSentenceVertices & outBottomRight, const HSentenceVertices & outBottomLeft,
- REO_POS phraseOrient)
-{
-
- HSentenceVertices::const_iterator it;
-
- if(phraseOrient == LEFT ||
- (connectedLeftTop && !connectedRightTop) ||
- // (startE == 0 && startF == 0) ||
- //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) ||
- ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
- it->second.find(startF-unit) != it->second.end()) ||
- ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
- it->second.find(startF-unit) != it->second.end()))
- return LEFT;
- if(modelType == REO_MONO)
- return UNKNOWN;
- if(phraseOrient == RIGHT ||
- (!connectedLeftTop && connectedRightTop) ||
- ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
- it->second.find(endF + unit) != it->second.end()) ||
- ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() &&
- it->second.find(endF + unit) != it->second.end()))
- return RIGHT;
- if(modelType == REO_MSD)
- return UNKNOWN;
- if(phraseOrient != UNKNOWN)
- return phraseOrient;
- connectedLeftTop = false;
- for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) {
- if((connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
- it->second.find(indexF) != it->second.end()) ||
- (connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
- it->second.find(indexF) != it->second.end()))
- return DRIGHT;
- }
- connectedRightTop = false;
- for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) {
- if((connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
- it->second.find(indexF) != it->second.end()) ||
- (connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() &&
- it->second.find(indexF) != it->second.end()))
- return DLEFT;
- }
- return UNKNOWN;
-}
-
-bool isAligned ( SentenceAlignment &sentence, int fi, int ei )
-{
- if (ei == -1 && fi == -1)
- return true;
- if (ei <= -1 || fi <= -1)
- return false;
- if ((size_t)ei == sentence.target.size() && (size_t)fi == sentence.source.size())
- return true;
- if ((size_t)ei >= sentence.target.size() || (size_t)fi >= sentence.source.size())
- return false;
- for(size_t i=0; i<sentence.alignedToT[ei].size(); i++)
- if (sentence.alignedToT[ei][i] == fi)
- return true;
- return false;
-}
-
-bool ge(int first, int second)
-{
- return first >= second;
-}
-
-bool le(int first, int second)
-{
- return first <= second;
-}
-
-bool lt(int first, int second)
-{
- return first < second;
-}
-
-void insertVertex( HSentenceVertices & corners, int x, int y )
-{
- set<int> tmp;
- tmp.insert(x);
- pair< HSentenceVertices::iterator, bool > ret = corners.insert( pair<int, set<int> > (y, tmp) );
- if(ret.second == false) {
- ret.first->second.insert(x);
- }
-}
-
-void insertPhraseVertices(
- HSentenceVertices & topLeft,
- HSentenceVertices & topRight,
- HSentenceVertices & bottomLeft,
- HSentenceVertices & bottomRight,
- int startF, int startE, int endF, int endE)
-{
-
- insertVertex(topLeft, startF, startE);
- insertVertex(topRight, endF, startE);
- insertVertex(bottomLeft, startF, endE);
- insertVertex(bottomRight, endF, endE);
-}
-
-string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
-{
- switch(orient) {
- case LEFT:
- return "mono";
- break;
- case RIGHT:
- return "swap";
- break;
- case DRIGHT:
- return "dright";
- break;
- case DLEFT:
- return "dleft";
- break;
- case UNKNOWN:
- switch(modelType) {
- case REO_MONO:
- return "nomono";
- break;
- case REO_MSD:
- return "other";
- break;
- case REO_MSLR:
- return "dright";
- break;
- }
- break;
- }
- return "";
-}
-
-int getClass(const std::string &str)
-{
- size_t pos = str.find("swap");
- if (pos == str.npos) {
- return 0;
- } else if (pos == 0) {
- return 1;
- } else {
- return 2;
- }
-}
-
-void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
-{
- if (m_options.isOnlyOutputSpanInfo()) {
- cout << startF << " " << endF << " " << startE << " " << endE << endl;
- return;
- }
-
- const string &sep = m_options.separator;
-
- m_extractFileOrientation << sentence.sentenceID << " " << sep << " ";
- m_extractFileOrientation << getClass(orientationInfo) << " " << sep << " ";
-
- // position
- m_extractFileOrientation << startF << " " << endF << " " << sep << " ";
-
- // start
- m_extractFileOrientation << "<s> ";
- for(int fi=0; fi<startF; fi++) {
- m_extractFileOrientation << sentence.source[fi] << " ";
- }
- m_extractFileOrientation << sep << " ";
-
- // middle
- for(int fi=startF; fi<=endF; fi++) {
- m_extractFileOrientation << sentence.source[fi] << " ";
- }
- m_extractFileOrientation << sep << " ";
-
- // end
- for(int fi=endF+1; fi<sentence.source.size(); fi++) {
- m_extractFileOrientation << sentence.source[fi] << " ";
- }
- m_extractFileOrientation << "</s> ";
-
-
- // target
- /*
- for(int ei=startE; ei<=endE; ei++) {
- m_extractFileOrientation << sentence.target[ei] << " ";
- }
- */
- m_extractFileOrientation << endl;
-}
-
-
-/** tokenise input string to vector of string. each element has been separated by a character in the delimiters argument.
- The separator can only be 1 character long. The default delimiters are space or tab
-*/
-std::vector<std::string> Tokenize(const std::string& str,
- const std::string& delimiters)
-{
- std::vector<std::string> tokens;
- // Skip delimiters at beginning.
- std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
- // Find first "non-delimiter".
- std::string::size_type pos = str.find_first_of(delimiters, lastPos);
-
- while (std::string::npos != pos || std::string::npos != lastPos) {
- // Found a token, add it to the vector.
- tokens.push_back(str.substr(lastPos, pos - lastPos));
- // Skip delimiters. Note the "not_of"
- lastPos = str.find_first_not_of(delimiters, pos);
- // Find next "non-delimiter"
- pos = str.find_first_of(delimiters, lastPos);
- }
-
- return tokens;
-}
-
-}
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index f5f44316e..50baa4e0d 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -29,6 +29,7 @@
#include <sstream>
#include <string>
#include <vector>
+#include <limits>
#ifdef WIN32
// Include Visual Leak Detector
@@ -39,7 +40,6 @@
#include "Hole.h"
#include "HoleCollection.h"
#include "RuleExist.h"
-#include "SafeGetline.h"
#include "SentenceAlignmentWithSyntax.h"
#include "SyntaxTree.h"
#include "tables-core.h"
@@ -47,8 +47,6 @@
#include "InputFileStream.h"
#include "OutputFileStream.h"
-#define LINE_MAX_LENGTH 500000
-
using namespace std;
using namespace MosesTraining;
@@ -326,17 +324,15 @@ int main(int argc, char* argv[])
// loop through all sentence pairs
size_t i=sentenceOffset;
- while(true) {
+ string targetString, sourceString, alignmentString;
+
+ while(getline(*tFileP, targetString)) {
i++;
- if (i%1000 == 0) cerr << i << " " << flush;
- char targetString[LINE_MAX_LENGTH];
- char sourceString[LINE_MAX_LENGTH];
- char alignmentString[LINE_MAX_LENGTH];
- SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
- if (tFileP->eof()) break;
- SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
- SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+ getline(*sFileP, sourceString);
+ getline(*aFileP, alignmentString);
+
+ if (i%1000 == 0) cerr << i << " " << flush;
SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
@@ -349,7 +345,7 @@ int main(int argc, char* argv[])
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence.create(targetString, sourceString, alignmentString,"", i, options.boundaryRules)) {
+ if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
if (options.unknownWordLabelFlag) {
collectWordLabelCounts(sentence);
}
@@ -412,7 +408,7 @@ void ExtractTask::extractRules()
// find find aligned source words
// first: find minimum and maximum source word
- int minS = 9999;
+ int minS = std::numeric_limits<int>::max();
int maxS = -1;
vector< int > usedS = m_sentence.alignedCountS;
for(int ti=startT; ti<=endT; ti++) {
@@ -1123,8 +1119,8 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
ofstream grammarFile;
grammarFile.open(fileName.c_str());
if (!options.targetSyntax) {
- grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
- << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
+ grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0" << endl
+ << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0" << endl
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
} else {
// chose a top label that is not already a label
@@ -1136,13 +1132,13 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
}
}
// basic rules
- grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << endl
- << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << endl;
+ grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| 0-0" << endl
+ << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1" << endl;
// top rules
for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
i != targetTopLabelCollection.end(); i++ ) {
- grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << endl;
+ grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2" << endl;
}
// glue rules
diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp
new file mode 100644
index 000000000..d62d599ec
--- /dev/null
+++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp
@@ -0,0 +1,161 @@
+#include "FilterRuleTable.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include "util/string_piece.hh"
+#include "util/string_piece_hash.hh"
+#include "util/tokenize_piece.hh"
+
+#include "syntax-common/exception.h"
+#include "syntax-common/xml_tree_parser.h"
+
+#include "InputFileStream.h"
+
+#include "Options.h"
+#include "StringBasedFilter.h"
+#include "TreeBasedFilter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+int FilterRuleTable::Main(int argc, char *argv[])
+{
+ // Process command-line options.
+ Options options;
+ ProcessOptions(argc, argv, options);
+
+ // Open input file.
+ Moses::InputFileStream testStream(options.testSetFile);
+
+ // Read the first test sentence and determine if it is a parse tree or a
+ // string.
+ std::string line;
+ if (!std::getline(testStream, line)) {
+ // TODO Error?
+ return 0;
+ }
+ if (line.find_first_of('<') == std::string::npos) {
+ // Test sentences are strings.
+ std::vector<std::vector<std::string> > sentences;
+ do {
+ sentences.resize(sentences.size()+1);
+ ReadTokens(line, sentences.back());
+ } while (std::getline(testStream, line));
+ StringBasedFilter filter(sentences);
+ filter.Filter(std::cin, std::cout);
+ } else {
+ // Test sentences are XML parse trees.
+ XmlTreeParser parser;
+ std::vector<boost::shared_ptr<StringTree> > sentences;
+ int lineNum = 1;
+ do {
+ if (line.size() == 0) {
+ std::cerr << "skipping blank test sentence at line " << lineNum
+ << std::endl;
+ continue;
+ }
+ sentences.push_back(boost::shared_ptr<StringTree>(parser.Parse(line)));
+ ++lineNum;
+ } while (std::getline(testStream, line));
+ TreeBasedFilter filter(sentences);
+ filter.Filter(std::cin, std::cout);
+ }
+
+ return 0;
+}
+
+void FilterRuleTable::ReadTokens(const std::string &s,
+ std::vector<std::string> &tokens)
+{
+ tokens.clear();
+// TODO
+}
+
+void FilterRuleTable::ProcessOptions(int argc, char *argv[],
+ Options &options) const
+{
+ namespace po = boost::program_options;
+ namespace cls = boost::program_options::command_line_style;
+
+ // Construct the 'top' of the usage message: the bit that comes before the
+ // options list.
+ std::ostringstream usageTop;
+ usageTop << "Usage: " << GetName()
+ << " [OPTION]... TEST\n\n"
+ << "Given a SCFG/STSG rule table (on standard input) and a set of test sentences,\nfilter out the rules that cannot be applied to any of the test sentences and\nwrite the filtered table to standard output.\n\n"
+ << "Options";
+
+ // Construct the 'bottom' of the usage message.
+ std::ostringstream usageBottom;
+ usageBottom << "TODO";
+
+ // Declare the command line options that are visible to the user.
+ po::options_description visible(usageTop.str());
+
+ // Declare the command line options that are hidden from the user
+ // (these are used as positional options).
+ po::options_description hidden("Hidden options");
+ hidden.add_options()
+ ("TestSetFile",
+ po::value(&options.testSetFile),
+ "test set file")
+ ;
+
+ // Compose the full set of command-line options.
+ po::options_description cmdLineOptions;
+ cmdLineOptions.add(visible).add(hidden);
+
+ // Register the positional options.
+ po::positional_options_description p;
+ p.add("TestSetFile", 1);
+
+ // Process the command-line.
+ po::variables_map vm;
+ const int optionStyle = cls::allow_long
+ | cls::long_allow_adjacent
+ | cls::long_allow_next;
+ try {
+ po::store(po::command_line_parser(argc, argv).style(optionStyle).
+ options(cmdLineOptions).positional(p).run(), vm);
+ po::notify(vm);
+ } catch (const std::exception &e) {
+ std::ostringstream msg;
+ msg << e.what() << "\n\n" << visible << usageBottom.str();
+ Error(msg.str());
+ }
+
+ if (vm.count("help")) {
+ std::cout << visible << usageBottom.str() << std::endl;
+ std::exit(0);
+ }
+
+ // Check all positional options were given.
+ if (!vm.count("TestSetFile")) {
+ std::ostringstream msg;
+ std::cerr << visible << usageBottom.str() << std::endl;
+ std::exit(1);
+ }
+}
+
+void FilterRuleTable::Error(const std::string &msg) const
+{
+ std::cerr << GetName() << ": " << msg << std::endl;
+ std::exit(1);
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.h b/phrase-extract/filter-rule-table/FilterRuleTable.h
new file mode 100644
index 000000000..379f286da
--- /dev/null
+++ b/phrase-extract/filter-rule-table/FilterRuleTable.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <vector>
+#include <string>
+
+#include <boost/shared_ptr.hpp>
+
+#include "syntax-common/string_tree.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+struct Options;
+
+class FilterRuleTable
+{
+public:
+ FilterRuleTable() : m_name("filter-rule-table") {}
+
+ const std::string &GetName() const {
+ return m_name;
+ }
+
+ int Main(int argc, char *argv[]);
+
+private:
+ void Error(const std::string &) const;
+
+ // Filter rule table (on std::cin) for test set (string version).
+ void Filter(const std::vector<std::vector<std::string> > &);
+
+ // Filter rule table (on std::cin) for test set (parse tree version).
+ void Filter(const std::vector<boost::shared_ptr<StringTree> > &);
+
+ void ProcessOptions(int, char *[], Options &) const;
+
+ void ReadTokens(const std::string &, std::vector<std::string> &);
+
+ std::string m_name;
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/Jamfile b/phrase-extract/filter-rule-table/Jamfile
new file mode 100644
index 000000000..b617706ab
--- /dev/null
+++ b/phrase-extract/filter-rule-table/Jamfile
@@ -0,0 +1 @@
+exe filter-rule-table : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : <include>.. ;
diff --git a/phrase-extract/filter-rule-table/Main.cpp b/phrase-extract/filter-rule-table/Main.cpp
new file mode 100644
index 000000000..1a7e25016
--- /dev/null
+++ b/phrase-extract/filter-rule-table/Main.cpp
@@ -0,0 +1,7 @@
+#include "FilterRuleTable.h"
+
+int main(int argc, char *argv[])
+{
+ MosesTraining::Syntax::FilterRuleTable::FilterRuleTable tool;
+ return tool.Main(argc, argv);
+}
diff --git a/phrase-extract/filter-rule-table/Options.h b/phrase-extract/filter-rule-table/Options.h
new file mode 100644
index 000000000..0c86c1411
--- /dev/null
+++ b/phrase-extract/filter-rule-table/Options.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <string>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+struct Options {
+public:
+ Options() {}
+
+ // Positional options
+ std::string testSetFile;
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace Moses
diff --git a/phrase-extract/filter-rule-table/StringBasedFilter.cpp b/phrase-extract/filter-rule-table/StringBasedFilter.cpp
new file mode 100644
index 000000000..1d8b69698
--- /dev/null
+++ b/phrase-extract/filter-rule-table/StringBasedFilter.cpp
@@ -0,0 +1,27 @@
+#include "StringBasedFilter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+StringBasedFilter::StringBasedFilter(
+ const std::vector<std::vector<std::string> > &sentences)
+{
+}
+
+void StringBasedFilter::Filter(std::istream &in, std::ostream &out)
+{
+ std::string line;
+ int lineNum = 0;
+ while (std::getline(in, line)) {
+ ++lineNum;
+ out << line << std::endl;
+ }
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/StringBasedFilter.h b/phrase-extract/filter-rule-table/StringBasedFilter.h
new file mode 100644
index 000000000..e74e174eb
--- /dev/null
+++ b/phrase-extract/filter-rule-table/StringBasedFilter.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <istream>
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+class StringBasedFilter {
+ public:
+ StringBasedFilter(const std::vector<std::vector<std::string> > &);
+
+ void Filter(std::istream &, std::ostream &);
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/TreeBasedFilter.cpp b/phrase-extract/filter-rule-table/TreeBasedFilter.cpp
new file mode 100644
index 000000000..f53c2faa7
--- /dev/null
+++ b/phrase-extract/filter-rule-table/TreeBasedFilter.cpp
@@ -0,0 +1,243 @@
+#include "TreeBasedFilter.h"
+
+#include "boost/scoped_ptr.hpp"
+
+#include "util/string_piece.hh"
+#include "util/string_piece_hash.hh"
+#include "util/tokenize_piece.hh"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+TreeBasedFilter::TreeBasedFilter(
+ const std::vector<boost::shared_ptr<StringTree> > &sentences)
+{
+ // Convert each StringTree to an IdTree.
+ m_sentences.reserve(sentences.size());
+ for (std::vector<boost::shared_ptr<StringTree> >::const_iterator p =
+ sentences.begin(); p != sentences.end(); ++p) {
+ m_sentences.push_back(boost::shared_ptr<IdTree>(StringTreeToIdTree(**p)));
+ }
+
+ m_labelToTree.resize(m_testVocab.Size());
+ // Construct a map from root labels to IdTree nodes.
+ for (std::vector<boost::shared_ptr<IdTree> >::const_iterator p =
+ m_sentences.begin(); p != m_sentences.end(); ++p) {
+ AddNodesToMap(**p);
+ }
+}
+
+TreeBasedFilter::IdTree *TreeBasedFilter::StringTreeToIdTree(
+ const StringTree &s)
+{
+ IdTree *t = new IdTree(m_testVocab.Insert(s.value()));
+ const std::vector<StringTree*> &sChildren = s.children();
+ std::vector<IdTree*> &tChildren = t->children();
+ tChildren.reserve(sChildren.size());
+ for (std::vector<StringTree*>::const_iterator p = sChildren.begin();
+ p != sChildren.end(); ++p) {
+ IdTree *child = StringTreeToIdTree(**p);
+ child->parent() = t;
+ tChildren.push_back(child);
+ }
+ return t;
+}
+
+void TreeBasedFilter::AddNodesToMap(const IdTree &tree)
+{
+ m_labelToTree[tree.value()].push_back(&tree);
+ const std::vector<IdTree*> &children = tree.children();
+ for (std::vector<IdTree*>::const_iterator p = children.begin();
+ p != children.end(); ++p) {
+ AddNodesToMap(**p);
+ }
+}
+
+void TreeBasedFilter::Filter(std::istream &in, std::ostream &out)
+{
+ const util::MultiCharacter delimiter("|||");
+
+ std::string line;
+ std::string prevLine;
+ StringPiece source;
+ bool keep;
+ int lineNum = 0;
+ std::vector<TreeFragmentToken> tokens;
+ std::vector<IdTree *> leaves;
+
+ while (std::getline(in, line)) {
+ ++lineNum;
+
+ // Read the source-side of the rule.
+ util::TokenIter<util::MultiCharacter> it(line, delimiter);
+
+ // Check if this rule has the same source-side as the previous rule. If
+ // it does then we already know whether or not to keep the rule. This
+ // optimisation is based on the assumption that the rule table is sorted
+ // (which is the case in the standard Moses training pipeline).
+ if (*it == source) {
+ if (keep) {
+ out << line << std::endl;
+ }
+ continue;
+ }
+
+ // The source-side is different from the previous rule's.
+ source = *it;
+
+ // Tokenize the source-side tree fragment.
+ tokens.clear();
+ for (TreeFragmentTokenizer p(source); p != TreeFragmentTokenizer(); ++p) {
+ tokens.push_back(*p);
+ }
+
+ // Construct an IdTree representing the source-side tree fragment. This
+ // will fail if the fragment contains any symbols that don't occur in
+ // m_testVocab and in that case the rule can be discarded. In practice,
+ // this catches a lot of discardable rules (see comment at the top of this
+ // function). If the fragment is successfully created then we attempt to
+ // match the tree fragment against the test trees. This test is exact, but
+ // slow.
+ int i = 0;
+ leaves.clear();
+ boost::scoped_ptr<IdTree> fragment(BuildTree(tokens, i, leaves));
+ keep = fragment.get() && MatchFragment(*fragment, leaves);
+ if (keep) {
+ out << line << std::endl;
+ }
+
+ // Retain line for the next iteration (in order that the source StringPiece
+ // remains valid).
+ prevLine.swap(line);
+ }
+}
+
+bool TreeBasedFilter::MatchFragment(const IdTree &fragment,
+ const std::vector<IdTree *> &leaves)
+{
+ typedef std::vector<const IdTree *> TreeVec;
+
+ // Determine which of the fragment's leaves has the smallest number of
+ // subtree matches in the test set. If the fragment contains a rare word
+ // (which is pretty likely assuming a Zipfian distribution) then we only
+ // have to try matching the fragment against a small number of potential
+ // match sites.
+ const IdTree *rarestLeaf = leaves[0];
+ std::size_t lowestCount = m_labelToTree[rarestLeaf->value()].size();
+ for (std::size_t i = 1; i < leaves.size(); ++i) {
+ const IdTree *leaf = leaves[i];
+ std::size_t count = m_labelToTree[leaf->value()].size();
+ if (count < lowestCount) {
+ lowestCount = count;
+ rarestLeaf = leaf;
+ }
+ }
+
+ // Determine the depth of the chosen leaf.
+ const std::size_t depth = rarestLeaf->Depth();
+
+ // Try to match the rule fragment against the test set subtrees where a
+ // leaf match was found.
+ TreeVec &nodes = m_labelToTree[rarestLeaf->value()];
+ for (TreeVec::const_iterator p = nodes.begin(); p != nodes.end(); ++p) {
+ // Navigate 'depth' positions up the subtree to find the root of the
+ // potential match site.
+ const IdTree *t = *p;
+ std::size_t d = depth;
+ while (d && t->parent()) {
+ t = t->parent();
+ --d;
+ }
+ if (d > 0) {
+ // The potential match site is not tall enough.
+ continue;
+ }
+ if (MatchFragment(fragment, *t)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+TreeBasedFilter::IdTree *TreeBasedFilter::BuildTree(
+ const std::vector<TreeFragmentToken> &tokens, int &i,
+ std::vector<IdTree *> &leaves)
+{
+ // The subtree starting at tokens[i] is either:
+ // 1. a single non-variable symbol (like NP or dog), or
+ // 2. a variable symbol (like [NP]), or
+ // 3. a subtree with children (like [NP [DT] [NN dog]])
+
+ // First check for case 1.
+ if (tokens[i].type == TreeFragmentToken_WORD) {
+ Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value,
+ StringPieceCompatibleHash(),
+ StringPieceCompatibleEquals());
+ if (id == Vocabulary::NullId()) {
+ return 0;
+ }
+ leaves.push_back(new IdTree(id));
+ return leaves.back();
+ }
+
+ // We must be dealing with either case 2 or 3. Case 2 looks like case 3 but
+ // without the children.
+ assert(tokens[i].type == TreeFragmentToken_LSB);
+
+ // Skip over the opening [
+ ++i;
+
+ // Read the root symbol of the subtree.
+ Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value,
+ StringPieceCompatibleHash(),
+ StringPieceCompatibleEquals());
+ if (id == Vocabulary::NullId()) {
+ return 0;
+ }
+ IdTree *root = new IdTree(id);
+
+ // Read the children (in case 2 there won't be any).
+ while (tokens[i].type != TreeFragmentToken_RSB) {
+ IdTree *child = BuildTree(tokens, i, leaves);
+ if (!child) {
+ delete root;
+ return 0;
+ }
+ root->children().push_back(child);
+ child->parent() = root;
+ }
+
+ if (root->IsLeaf()) {
+ leaves.push_back(root);
+ }
+
+ // Skip over the closing ] and we're done.
+ ++i;
+ return root;
+}
+
+bool TreeBasedFilter::MatchFragment(const IdTree &fragment, const IdTree &tree)
+{
+ if (fragment.value() != tree.value()) {
+ return false;
+ }
+ const std::vector<IdTree*> &fragChildren = fragment.children();
+ const std::vector<IdTree*> &treeChildren = tree.children();
+ if (!fragChildren.empty() && fragChildren.size() != treeChildren.size()) {
+ return false;
+ }
+ for (std::size_t i = 0; i < fragChildren.size(); ++i) {
+ if (!MatchFragment(*fragChildren[i], *treeChildren[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/TreeBasedFilter.h b/phrase-extract/filter-rule-table/TreeBasedFilter.h
new file mode 100644
index 000000000..e22731fe8
--- /dev/null
+++ b/phrase-extract/filter-rule-table/TreeBasedFilter.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <istream>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/unordered_map.hpp>
+
+#include "syntax-common/numbered_set.h"
+#include "syntax-common/string_tree.h"
+#include "syntax-common/tree.h"
+#include "syntax-common/tree_fragment_tokenizer.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+// Filters a rule table (currently assumed to be tree-to-string, STSG),
+// discarding rules that cannot be applied to a given set of test sentences.
+class TreeBasedFilter {
+ public:
+ // Initialize the filter for a given set of test sentences.
+ TreeBasedFilter(const std::vector<boost::shared_ptr<StringTree> > &);
+
+ // Read a rule table from 'in' and filter it according to the test sentences.
+ // This is slow because it involves testing every rule (or a significant
+ // fraction) at every node of every test sentence parse tree. There are a
+ // couple of optimizations that speed things up in practice, but it could
+ // still use some work to make it faster.
+ //
+ // Some statistics from real data (WMT14, English-German):
+ //
+ // 4.4M Parallel sentences (source-side parsed with Berkeley parser)
+ // 2.7K Test sentences (newstest2014)
+ //
+ // 73.4M Original rule table size (number of distinct, composed GHKM rules)
+ // 22.9M Number of rules with same source-side as previous rule
+ // 50.5M Number of rules requiring vocabulary matching test
+ // 24.1M Number of rules requiring full tree matching test
+ // 6.7M Number of rules retained after filtering
+ //
+ void Filter(std::istream &in, std::ostream &out);
+
+ private:
+ // Maps source-side symbols (terminals and non-terminals) from strings to
+ // integers.
+ typedef NumberedSet<std::string, std::size_t> Vocabulary;
+
+ // Represents the test trees using their integer vocabulary values for faster
+ // matching.
+ typedef Tree<Vocabulary::IdType> IdTree;
+
+ // Add an entry to m_labelToTree for every subtree of the given tree.
+ void AddNodesToMap(const IdTree &);
+
+ // Build an IdTree (wrt m_testVocab) for the tree beginning at position i of
+ // the token sequence or return 0 if any symbol in the fragment is not in
+ // m_testVocab. If successful then on return, i will be set to the position
+ // immediately after the last token of the tree and leaves will contain the
+ // pointers to the fragment's leaves. If the build fails then i and leaves
+ // are undefined.
+ IdTree *BuildTree(const std::vector<TreeFragmentToken> &tokens, int &i,
+ std::vector<IdTree *> &leaves);
+
+ // Try to match a fragment against any test tree.
+ bool MatchFragment(const IdTree &, const std::vector<IdTree *> &);
+
+ // Try to match a fragment against a specific subtree of a test tree.
+ bool MatchFragment(const IdTree &, const IdTree &);
+
+ // Convert a StringTree to an IdTree (wrt m_testVocab). Inserts symbols into
+ // m_testVocab.
+ IdTree *StringTreeToIdTree(const StringTree &);
+
+ std::vector<boost::shared_ptr<IdTree> > m_sentences;
+ std::vector<std::vector<const IdTree *> > m_labelToTree;
+ Vocabulary m_testVocab;
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-common/Jamfile b/phrase-extract/pcfg-common/Jamfile
index b74b1071d..5669b443e 100644
--- a/phrase-extract/pcfg-common/Jamfile
+++ b/phrase-extract/pcfg-common/Jamfile
@@ -1 +1 @@
-lib pcfg_common : [ glob *.cc ] ..//deps : <include>.. ;
+lib pcfg_common : [ glob *.cc ] ..//syntax-common ..//deps : <include>.. ;
diff --git a/phrase-extract/pcfg-common/exception.h b/phrase-extract/pcfg-common/exception.h
deleted file mode 100644
index d9266ca36..000000000
--- a/phrase-extract/pcfg-common/exception.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_EXCEPTION_H_
-#define PCFG_EXCEPTION_H_
-
-#include <string>
-
-namespace Moses
-{
-namespace PCFG
-{
-
-class Exception
-{
-public:
- Exception(const char *msg) : msg_(msg) {}
- Exception(const std::string &msg) : msg_(msg) {}
- const std::string &msg() const {
- return msg_;
- }
-private:
- std::string msg_;
-};
-
-} // namespace PCFG
-} // namespace Moses
-
-#endif
diff --git a/phrase-extract/pcfg-common/numbered_set.h b/phrase-extract/pcfg-common/numbered_set.h
deleted file mode 100644
index 66e960404..000000000
--- a/phrase-extract/pcfg-common/numbered_set.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_NUMBERED_SET_H_
-#define PCFG_NUMBERED_SET_H_
-
-#include "exception.h"
-
-#include <boost/unordered_map.hpp>
-
-#include <limits>
-#include <sstream>
-#include <vector>
-
-namespace Moses
-{
-namespace PCFG
-{
-
-// Stores a set of elements of type T, each of which is allocated an integral
-// ID of type I. IDs are contiguous starting at 0. Individual elements cannot
-// be removed once inserted (but the whole set can be cleared).
-template<typename T, typename I=std::size_t>
-class NumberedSet
-{
-private:
- typedef boost::unordered_map<T, I> ElementToIdMap;
- typedef std::vector<const T *> IdToElementMap;
-
-public:
- typedef I IdType;
- typedef typename IdToElementMap::const_iterator const_iterator;
-
- NumberedSet() {}
-
- const_iterator begin() const {
- return id_to_element_.begin();
- }
- const_iterator end() const {
- return id_to_element_.end();
- }
-
- // Static value
- static I NullId() {
- return std::numeric_limits<I>::max();
- }
-
- bool Empty() const {
- return id_to_element_.empty();
- }
- std::size_t Size() const {
- return id_to_element_.size();
- }
-
- // Insert the given object and return its ID.
- I Insert(const T &);
-
- I Lookup(const T &) const;
- const T &Lookup(I) const;
-
- void Clear();
-
-private:
- ElementToIdMap element_to_id_;
- IdToElementMap id_to_element_;
-};
-
-template<typename T, typename I>
-I NumberedSet<T, I>::Lookup(const T &s) const
-{
- typename ElementToIdMap::const_iterator p = element_to_id_.find(s);
- return (p == element_to_id_.end()) ? NullId() : p->second;
-}
-
-template<typename T, typename I>
-const T &NumberedSet<T, I>::Lookup(I id) const
-{
- if (id < 0 || id >= id_to_element_.size()) {
- std::ostringstream msg;
- msg << "Value not found: " << id;
- throw Exception(msg.str());
- }
- return *(id_to_element_[id]);
-}
-
-template<typename T, typename I>
-I NumberedSet<T, I>::Insert(const T &x)
-{
- std::pair<T, I> value(x, id_to_element_.size());
- std::pair<typename ElementToIdMap::iterator, bool> result =
- element_to_id_.insert(value);
- if (result.second) {
- // x is a new element.
- id_to_element_.push_back(&result.first->first);
- }
- return result.first->second;
-}
-
-template<typename T, typename I>
-void NumberedSet<T, I>::Clear()
-{
- element_to_id_.clear();
- id_to_element_.clear();
-}
-
-} // namespace PCFG
-} // namespace Moses
-
-#endif
diff --git a/phrase-extract/pcfg-common/pcfg.cc b/phrase-extract/pcfg-common/pcfg.cc
index 054e20a48..cae6d4763 100644
--- a/phrase-extract/pcfg-common/pcfg.cc
+++ b/phrase-extract/pcfg-common/pcfg.cc
@@ -19,14 +19,15 @@
#include "pcfg.h"
-#include "exception.h"
+#include <cassert>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
-#include <cassert>
+#include "syntax-common/exception.h"
-namespace Moses {
+namespace MosesTraining {
+namespace Syntax {
namespace PCFG {
void Pcfg::Add(const Key &key, double score) {
@@ -103,4 +104,5 @@ void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const {
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-common/pcfg.h b/phrase-extract/pcfg-common/pcfg.h
index 5398cd97e..c5c04cba4 100644
--- a/phrase-extract/pcfg-common/pcfg.h
+++ b/phrase-extract/pcfg-common/pcfg.h
@@ -21,21 +21,19 @@
#ifndef PCFG_PCFG_H_
#define PCFG_PCFG_H_
-#include "typedef.h"
-
#include <istream>
#include <map>
#include <ostream>
#include <vector>
-namespace Moses
-{
-namespace PCFG
-{
+#include "typedef.h"
+
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
-class Pcfg
-{
-public:
+class Pcfg {
+ public:
typedef std::vector<std::size_t> Key;
typedef std::map<Key, double> Map;
typedef Map::iterator iterator;
@@ -43,30 +41,23 @@ public:
Pcfg() {}
- iterator begin() {
- return rules_.begin();
- }
- const_iterator begin() const {
- return rules_.begin();
- }
+ iterator begin() { return rules_.begin(); }
+ const_iterator begin() const { return rules_.begin(); }
- iterator end() {
- return rules_.end();
- }
- const_iterator end() const {
- return rules_.end();
- }
+ iterator end() { return rules_.end(); }
+ const_iterator end() const { return rules_.end(); }
void Add(const Key &, double);
bool Lookup(const Key &, double &) const;
void Read(std::istream &, Vocabulary &);
void Write(const Vocabulary &, std::ostream &) const;
-private:
+ private:
Map rules_;
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h
index d125cad16..ce28eb8dd 100644
--- a/phrase-extract/pcfg-common/pcfg_tree.h
+++ b/phrase-extract/pcfg-common/pcfg_tree.h
@@ -21,48 +21,40 @@
#ifndef PCFG_PCFG_TREE_H_
#define PCFG_PCFG_TREE_H_
+#include <string>
+
#include "syntax_tree.h"
#include "xml_tree_writer.h"
-#include <string>
-
-namespace Moses
-{
-namespace PCFG
-{
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
template<typename DerivedType>
-class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType>
-{
-public:
+class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
+ public:
typedef std::string LabelType;
typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {}
- double score() const {
- return score_;
- }
- void set_score(double s) {
- score_ = s;
- }
+ double score() const { return score_; }
+ void set_score(double s) { score_ = s; }
-private:
+ private:
double score_;
};
-class PcfgTree : public PcfgTreeBase<PcfgTree>
-{
-public:
+class PcfgTree : public PcfgTreeBase<PcfgTree> {
+ public:
typedef PcfgTreeBase<PcfgTree> BaseType;
PcfgTree(const BaseType::LabelType &label) : BaseType(label) {}
};
// Specialise XmlOutputHandler for PcfgTree.
template<>
-class XmlOutputHandler<PcfgTree>
-{
-public:
+class XmlOutputHandler<PcfgTree> {
+ public:
typedef std::map<std::string, std::string> AttributeMap;
void GetLabel(const PcfgTree &tree, std::string &label) const {
@@ -81,6 +73,7 @@ public:
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h
index 93d9dbec9..c0c6eaef9 100644
--- a/phrase-extract/pcfg-common/syntax_tree.h
+++ b/phrase-extract/pcfg-common/syntax_tree.h
@@ -24,16 +24,14 @@
#include <cassert>
#include <vector>
-namespace Moses
-{
-namespace PCFG
-{
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
// Base class for SyntaxTree, AgreementTree, and friends.
template<typename T, typename DerivedType>
-class SyntaxTreeBase
-{
-public:
+class SyntaxTreeBase {
+ public:
// Constructors
SyntaxTreeBase(const T &label)
: label_(label)
@@ -48,54 +46,33 @@ public:
// Destructor
virtual ~SyntaxTreeBase();
- const T &label() const {
- return label_;
- }
- const DerivedType *parent() const {
- return parent_;
- }
- DerivedType *parent() {
- return parent_;
- }
- const std::vector<DerivedType *> &children() const {
- return children_;
- }
- std::vector<DerivedType *> &children() {
- return children_;
- }
+ const T &label() const { return label_; }
+ const DerivedType *parent() const { return parent_; }
+ DerivedType *parent() { return parent_; }
+ const std::vector<DerivedType *> &children() const { return children_; }
+ std::vector<DerivedType *> &children() { return children_; }
- void set_label(const T &label) {
- label_ = label;
- }
- void set_parent(DerivedType *parent) {
- parent_ = parent;
- }
- void set_children(const std::vector<DerivedType *> &c) {
- children_ = c;
- }
+ void set_label(const T &label) { label_ = label; }
+ void set_parent(DerivedType *parent) { parent_ = parent; }
+ void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
- bool IsLeaf() const {
- return children_.empty();
- }
+ bool IsLeaf() const { return children_.empty(); }
bool IsPreterminal() const {
return children_.size() == 1 && children_[0]->IsLeaf();
}
- void AddChild(DerivedType *child) {
- children_.push_back(child);
- }
+ void AddChild(DerivedType *child) { children_.push_back(child); }
-private:
+ private:
T label_;
std::vector<DerivedType *> children_;
DerivedType *parent_;
};
template<typename T>
-class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> >
-{
-public:
+class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
+ public:
typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
SyntaxTree(const T &label) : BaseType(label) {}
SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
@@ -103,14 +80,14 @@ public:
};
template<typename T, typename DerivedType>
-SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase()
-{
+SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
for (std::size_t i = 0; i < children_.size(); ++i) {
delete children_[i];
}
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-common/tool.cc b/phrase-extract/pcfg-common/tool.cc
index bebd220e1..f54e07a12 100644
--- a/phrase-extract/pcfg-common/tool.cc
+++ b/phrase-extract/pcfg-common/tool.cc
@@ -21,7 +21,8 @@
#include <sstream>
-namespace Moses {
+namespace MosesTraining {
+namespace Syntax {
namespace PCFG {
std::istream &Tool::OpenInputOrDie(const std::string &filename) {
@@ -77,4 +78,5 @@ void Tool::OpenNamedOutputOrDie(const std::string &filename,
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-common/tool.h b/phrase-extract/pcfg-common/tool.h
index aada036e3..2c903a11e 100644
--- a/phrase-extract/pcfg-common/tool.h
+++ b/phrase-extract/pcfg-common/tool.h
@@ -21,30 +21,26 @@
#ifndef PCFG_TOOL_H_
#define PCFG_TOOL_H_
-#include <boost/program_options/cmdline.hpp>
-
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
-namespace Moses
-{
-namespace PCFG
-{
+#include <boost/program_options/cmdline.hpp>
-class Tool
-{
-public:
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
+
+class Tool {
+ public:
virtual ~Tool() {}
- const std::string &name() const {
- return name_;
- }
+ const std::string &name() const { return name_; }
virtual int Main(int argc, char *argv[]) = 0;
-protected:
+ protected:
Tool(const std::string &name) : name_(name) {}
// Returns the boost::program_options style that should be used by all tools.
@@ -82,7 +78,7 @@ protected:
// the file cannot be opened for writing.
void OpenNamedOutputOrDie(const std::string &, std::ofstream &);
-private:
+ private:
std::string name_;
std::istream *input_ptr_;
std::ifstream input_file_stream_;
@@ -91,6 +87,7 @@ private:
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h
index ce3e0423b..e738163df 100644
--- a/phrase-extract/pcfg-common/typedef.h
+++ b/phrase-extract/pcfg-common/typedef.h
@@ -21,19 +21,19 @@
#ifndef PCFG_TYPEDEF_H_
#define PCFG_TYPEDEF_H_
-#include "numbered_set.h"
-#include "syntax_tree.h"
-
#include <string>
-namespace Moses
-{
-namespace PCFG
-{
+#include "syntax-common/numbered_set.h"
+#include "syntax_tree.h"
+
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
typedef NumberedSet<std::string> Vocabulary;
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc
index b6c1da177..3d9291994 100644
--- a/phrase-extract/pcfg-common/xml_tree_parser.cc
+++ b/phrase-extract/pcfg-common/xml_tree_parser.cc
@@ -19,25 +19,23 @@
#include "xml_tree_parser.h"
-#include "exception.h"
+#include <cassert>
+#include <vector>
+
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
-#include <cassert>
-#include <vector>
-
-using namespace MosesTraining;
+#include "syntax-common/exception.h"
-namespace Moses {
+namespace MosesTraining {
+namespace Syntax {
namespace PCFG {
-XmlTreeParser::XmlTreeParser()
-{
+XmlTreeParser::XmlTreeParser() {
}
-std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line)
-{
+std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
m_line = line;
m_tree.Clear();
try {
@@ -60,8 +58,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line)
// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
const SyntaxNode &tree,
- const std::vector<std::string> &words)
-{
+ const std::vector<std::string> &words) {
std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
const std::vector<SyntaxNode*> &children = tree.GetChildren();
if (children.empty()) {
@@ -87,4 +84,5 @@ std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h
index 7eec14033..675a112d8 100644
--- a/phrase-extract/pcfg-common/xml_tree_parser.h
+++ b/phrase-extract/pcfg-common/xml_tree_parser.h
@@ -21,28 +21,26 @@
#ifndef PCFG_XML_TREE_PARSER_H_
#define PCFG_XML_TREE_PARSER_H_
-#include "pcfg_tree.h"
-#include "SyntaxTree.h"
-
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>
-namespace Moses
-{
-namespace PCFG
-{
+#include "pcfg_tree.h"
+#include "SyntaxTree.h"
+
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
// Parses a string in Moses' XML parse tree format and returns a PcfgTree
// object.
-class XmlTreeParser
-{
-public:
+class XmlTreeParser {
+ public:
XmlTreeParser();
std::auto_ptr<PcfgTree> Parse(const std::string &);
-private:
+ private:
std::auto_ptr<PcfgTree> ConvertTree(const MosesTraining::SyntaxNode &,
const std::vector<std::string> &);
@@ -54,6 +52,7 @@ private:
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h
index e09942279..8582e544f 100644
--- a/phrase-extract/pcfg-common/xml_tree_writer.h
+++ b/phrase-extract/pcfg-common/xml_tree_writer.h
@@ -21,10 +21,6 @@
#ifndef PCFG_XML_TREE_WRITER_H_
#define PCFG_XML_TREE_WRITER_H_
-#include "syntax_tree.h"
-
-#include "XmlTree.h"
-
#include <cassert>
#include <map>
#include <memory>
@@ -32,15 +28,17 @@
#include <vector>
#include <string>
-namespace Moses
-{
-namespace PCFG
-{
+#include "XmlTree.h"
+
+#include "syntax_tree.h"
+
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
template<typename InputTree>
-class XmlOutputHandler
-{
-public:
+class XmlOutputHandler {
+ public:
typedef std::map<std::string, std::string> AttributeMap;
void GetLabel(const InputTree &, std::string &) const;
@@ -48,19 +46,17 @@ public:
};
template<typename InputTree>
-class XmlTreeWriter : public XmlOutputHandler<InputTree>
-{
-public:
+class XmlTreeWriter : public XmlOutputHandler<InputTree> {
+ public:
typedef XmlOutputHandler<InputTree> Base;
void Write(const InputTree &, std::ostream &) const;
-private:
+ private:
std::string Escape(const std::string &) const;
};
template<typename InputTree>
void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
- std::ostream &out) const
-{
+ std::ostream &out) const {
assert(!tree.IsLeaf());
// Opening tag
@@ -104,8 +100,7 @@ void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
// Escapes XML special characters.
template<typename InputTree>
-std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const
-{
+std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
std::string t;
std::size_t len = s.size();
t.reserve(len);
@@ -134,6 +129,7 @@ std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-extract/main.cc b/phrase-extract/pcfg-extract/main.cc
index 47b45afc3..84051f2e2 100644
--- a/phrase-extract/pcfg-extract/main.cc
+++ b/phrase-extract/pcfg-extract/main.cc
@@ -20,6 +20,6 @@
#include "pcfg_extract.h"
int main(int argc, char *argv[]) {
- Moses::PCFG::PcfgExtract tool;
+ MosesTraining::Syntax::PCFG::PcfgExtract tool;
return tool.Main(argc, argv);
}
diff --git a/phrase-extract/pcfg-extract/options.h b/phrase-extract/pcfg-extract/options.h
index 2633f025a..ffaa3bb17 100644
--- a/phrase-extract/pcfg-extract/options.h
+++ b/phrase-extract/pcfg-extract/options.h
@@ -23,16 +23,16 @@
#include <string>
-namespace Moses
-{
-namespace PCFG
-{
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
struct Options {
std::string corpus_file;
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc
index 71c2e31c3..a5e06aa82 100644
--- a/phrase-extract/pcfg-extract/pcfg_extract.cc
+++ b/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -23,7 +23,8 @@
#include "rule_collection.h"
#include "rule_extractor.h"
-#include "pcfg-common/exception.h"
+#include "syntax-common/exception.h"
+
#include "pcfg-common/pcfg.h"
#include "pcfg-common/pcfg_tree.h"
#include "pcfg-common/syntax_tree.h"
@@ -42,7 +43,8 @@
#include <string>
#include <vector>
-namespace Moses {
+namespace MosesTraining {
+namespace Syntax {
namespace PCFG {
int PcfgExtract::Main(int argc, char *argv[]) {
@@ -128,4 +130,5 @@ void PcfgExtract::ProcessOptions(int argc, char *argv[],
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.h b/phrase-extract/pcfg-extract/pcfg_extract.h
index e8c306876..835564341 100644
--- a/phrase-extract/pcfg-extract/pcfg_extract.h
+++ b/phrase-extract/pcfg-extract/pcfg_extract.h
@@ -23,15 +23,13 @@
#include "pcfg-common/tool.h"
-namespace Moses
-{
-namespace PCFG
-{
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
-class Options;
+struct Options;
-class PcfgExtract : public Tool
-{
+class PcfgExtract : public Tool {
public:
PcfgExtract() : Tool("pcfg-extract") {}
virtual int Main(int, char *[]);
@@ -40,6 +38,7 @@ private:
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-extract/rule_collection.cc b/phrase-extract/pcfg-extract/rule_collection.cc
index 32b63e0ef..21e84d2fa 100644
--- a/phrase-extract/pcfg-extract/rule_collection.cc
+++ b/phrase-extract/pcfg-extract/rule_collection.cc
@@ -23,7 +23,8 @@
#include <cmath>
-namespace Moses {
+namespace MosesTraining {
+namespace Syntax {
namespace PCFG {
void RuleCollection::Add(std::size_t lhs, const std::vector<std::size_t> &rhs) {
@@ -55,4 +56,5 @@ void RuleCollection::CreatePcfg(Pcfg &pcfg) {
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-extract/rule_collection.h b/phrase-extract/pcfg-extract/rule_collection.h
index 32cb2dc05..66fa98657 100644
--- a/phrase-extract/pcfg-extract/rule_collection.h
+++ b/phrase-extract/pcfg-extract/rule_collection.h
@@ -21,21 +21,19 @@
#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_
#define PCFG_EXTRACT_RULE_COLLECTION_H_
-#include "pcfg-common/pcfg.h"
+#include <vector>
#include <boost/unordered_map.hpp>
-#include <vector>
+#include "pcfg-common/pcfg.h"
-namespace Moses
-{
-namespace PCFG
-{
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
// Contains PCFG rules and their counts.
-class RuleCollection
-{
-public:
+class RuleCollection {
+ public:
typedef boost::unordered_map<std::vector<std::size_t>, std::size_t> RhsCountMap;
typedef boost::unordered_map<std::size_t, RhsCountMap> Map;
typedef Map::iterator iterator;
@@ -43,28 +41,21 @@ public:
RuleCollection() {}
- iterator begin() {
- return collection_.begin();
- }
- const_iterator begin() const {
- return collection_.begin();
- }
+ iterator begin() { return collection_.begin(); }
+ const_iterator begin() const { return collection_.begin(); }
- iterator end() {
- return collection_.end();
- }
- const_iterator end() const {
- return collection_.end();
- }
+ iterator end() { return collection_.end(); }
+ const_iterator end() const { return collection_.end(); }
void Add(std::size_t, const std::vector<std::size_t> &);
void CreatePcfg(Pcfg &);
-private:
+ private:
Map collection_;
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc
index 217574e7d..bb4698fae 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.cc
+++ b/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -21,7 +21,8 @@
#include "pcfg-common/pcfg_tree.h"
-namespace Moses {
+namespace MosesTraining {
+namespace Syntax {
namespace PCFG {
RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
@@ -48,4 +49,5 @@ void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const {
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h
index e4b411c01..1dddd796f 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.h
+++ b/phrase-extract/pcfg-extract/rule_extractor.h
@@ -21,28 +21,27 @@
#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
-#include "rule_collection.h"
-
#include "pcfg-common/typedef.h"
-namespace Moses
-{
-namespace PCFG
-{
+#include "rule_collection.h"
+
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
class PcfgTree;
// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
-class RuleExtractor
-{
-public:
+class RuleExtractor {
+ public:
RuleExtractor(Vocabulary &);
void Extract(const PcfgTree &, RuleCollection &) const;
-private:
+ private:
Vocabulary &non_term_vocab_;
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-score/main.cc b/phrase-extract/pcfg-score/main.cc
index da5392add..5ce19f797 100644
--- a/phrase-extract/pcfg-score/main.cc
+++ b/phrase-extract/pcfg-score/main.cc
@@ -20,6 +20,6 @@
#include "pcfg_score.h"
int main(int argc, char *argv[]) {
- Moses::PCFG::PcfgScore tool;
+ MosesTraining::Syntax::PCFG::PcfgScore tool;
return tool.Main(argc, argv);
}
diff --git a/phrase-extract/pcfg-score/options.h b/phrase-extract/pcfg-score/options.h
index fd54b4b6b..bbd56d6d0 100644
--- a/phrase-extract/pcfg-score/options.h
+++ b/phrase-extract/pcfg-score/options.h
@@ -23,16 +23,16 @@
#include <string>
-namespace Moses
-{
-namespace PCFG
-{
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
struct Options {
std::string pcfg_file;
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc
index 345d7fc60..a561c18ed 100644
--- a/phrase-extract/pcfg-score/pcfg_score.cc
+++ b/phrase-extract/pcfg-score/pcfg_score.cc
@@ -19,18 +19,6 @@
#include "pcfg_score.h"
-#include "options.h"
-#include "tree_scorer.h"
-
-#include "pcfg-common/exception.h"
-#include "pcfg-common/pcfg.h"
-#include "pcfg-common/pcfg_tree.h"
-#include "pcfg-common/syntax_tree.h"
-#include "pcfg-common/typedef.h"
-#include "pcfg-common/xml_tree_parser.h"
-
-#include <boost/program_options.hpp>
-
#include <cassert>
#include <cstdlib>
#include <fstream>
@@ -40,8 +28,21 @@
#include <set>
#include <string>
#include <vector>
+#include "options.h"
+#include "tree_scorer.h"
+
+#include <boost/program_options.hpp>
+
+#include "syntax-common/exception.h"
+
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
-namespace Moses {
+namespace MosesTraining {
+namespace Syntax {
namespace PCFG {
int PcfgScore::Main(int argc, char *argv[]) {
@@ -149,4 +150,5 @@ void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-score/pcfg_score.h b/phrase-extract/pcfg-score/pcfg_score.h
index f49c9a0be..fb9971c35 100644
--- a/phrase-extract/pcfg-score/pcfg_score.h
+++ b/phrase-extract/pcfg-score/pcfg_score.h
@@ -23,23 +23,22 @@
#include "pcfg-common/tool.h"
-namespace Moses
-{
-namespace PCFG
-{
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
-class Options;
+struct Options;
-class PcfgScore : public Tool
-{
-public:
+class PcfgScore : public Tool {
+ public:
PcfgScore() : Tool("pcfg-score") {}
virtual int Main(int, char *[]);
-private:
+ private:
void ProcessOptions(int, char *[], Options &) const;
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc
index f9ce97ae0..53b6aaccf 100644
--- a/phrase-extract/pcfg-score/tree_scorer.cc
+++ b/phrase-extract/pcfg-score/tree_scorer.cc
@@ -21,7 +21,8 @@
#include <cassert>
-namespace Moses {
+namespace MosesTraining {
+namespace Syntax {
namespace PCFG {
TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
@@ -65,4 +66,5 @@ bool TreeScorer::Score(PcfgTree &root) const {
}
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h
index 8cb59c0c2..3cf4693a6 100644
--- a/phrase-extract/pcfg-score/tree_scorer.h
+++ b/phrase-extract/pcfg-score/tree_scorer.h
@@ -25,26 +25,25 @@
#include "pcfg-common/pcfg_tree.h"
#include "pcfg-common/typedef.h"
-namespace Moses
-{
-namespace PCFG
-{
-
-class TreeScorer
-{
-public:
+namespace MosesTraining {
+namespace Syntax {
+namespace PCFG {
+
+class TreeScorer {
+ public:
TreeScorer(const Pcfg &, const Vocabulary &);
// Score tree according to PCFG. Returns false if unsuccessful (due to
// missing rule).
bool Score(PcfgTree &) const;
-private:
+ private:
const Pcfg &pcfg_;
const Vocabulary &non_term_vocab_;
};
} // namespace PCFG
-} // namespace Moses
+} // namespace Syntax
+} // namespace MosesTraining
#endif
diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp
index d3cb4c527..8188d70ec 100644
--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -20,8 +20,6 @@
***********************************************************************/
#include "relax-parse.h"
-
-#include "SafeGetline.h"
#include "tables-core.h"
using namespace std;
@@ -33,19 +31,15 @@ int main(int argc, char* argv[])
// loop through all sentences
int i=0;
- char inBuffer[LINE_MAX_LENGTH];
- while(true) {
+ string inBufferString;
+ while(cin.peek() != EOF) {
+ getline(cin,inBufferString);
i++;
if (i%1000 == 0) cerr << "." << flush;
if (i%10000 == 0) cerr << ":" << flush;
if (i%100000 == 0) cerr << "!" << flush;
- // get line from stdin
- SAFE_GETLINE( cin, inBuffer, LINE_MAX_LENGTH, '\n', __FILE__);
- if (cin.eof()) break;
-
// process into syntax tree representation
- string inBufferString = string( inBuffer );
set< string > labelCollection; // set of labels, not used
map< string, int > topLabelCollection; // count of top labels, not used
SyntaxTree tree;
@@ -83,7 +77,7 @@ void init(int argc, char* argv[])
if (argc < 2) {
cerr << "syntax: relax-parse < in-parse > out-parse ["
- << " --LeftBinarize | ---RightBinarize |"
+ << " --LeftBinarize | --RightBinarize |"
<< " --SAMT 1-4 ]" << endl;
exit(1);
}
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 69c2aff48..2a8413c48 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -28,8 +28,8 @@
#include <set>
#include <vector>
#include <algorithm>
+#include <boost/unordered_map.hpp>
-#include "SafeGetline.h"
#include "ScoreFeature.h"
#include "tables-core.h"
#include "ExtractionPhrasePair.h"
@@ -40,15 +40,18 @@
using namespace std;
using namespace MosesTraining;
-#define LINE_MAX_LENGTH 100000
-
namespace MosesTraining
{
LexicalTable lexTable;
bool inverseFlag = false;
bool hierarchicalFlag = false;
bool pcfgFlag = false;
+bool phraseOrientationFlag = false;
bool treeFragmentsFlag = false;
+bool sourceSyntaxLabelsFlag = false;
+bool sourceSyntaxLabelSetFlag = false;
+bool sourceSyntaxLabelCountsLHSFlag = false;
+bool targetPreferenceLabelsFlag = false;
bool unpairedExtractFormatFlag = false;
bool conditionOnTargetLhsFlag = false;
bool wordAlignmentFlag = true;
@@ -61,16 +64,29 @@ bool lexFlag = true;
bool unalignedFlag = false;
bool unalignedFWFlag = false;
bool crossedNonTerm = false;
+bool spanLength = false;
+bool nonTermContext = false;
+
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCountHierarchical = 0;
-std::map<std::string,float> sourceLHSCounts;
-std::map<std::string, std::map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
+bool phraseOrientationPriorsFlag = false;
+boost::unordered_map<std::string,float> sourceLHSCounts;
+boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
std::set<std::string> sourceLabelSet;
-std::map<std::string,size_t> sourceLabels;
+std::map<std::string,size_t> sourceLabels;
std::vector<std::string> sourceLabelsByIndex;
+boost::unordered_map<std::string,float> targetPreferenceLHSCounts;
+boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts;
+std::set<std::string> targetPreferenceLabelSet;
+std::map<std::string,size_t> targetPreferenceLabels;
+std::vector<std::string> targetPreferenceLabelsByIndex;
+
+std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dright dleft
+std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dright dleft
+
Vocabulary vcbT;
Vocabulary vcbS;
@@ -79,17 +95,23 @@ Vocabulary vcbS;
std::vector<std::string> tokenize( const char [] );
void processLine( std::string line,
- int lineID, bool includeSentenceIdFlag, int &sentenceId,
+ int lineID, bool includeSentenceIdFlag, int &sentenceId,
PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
std::string &additionalPropertiesString,
float &count, float &pcfgSum );
void writeCountOfCounts( const std::string &fileNameCountOfCounts );
-void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
+void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
+ const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
+ const std::string &fileNameLeftHandSideSourceLabelCounts,
+ const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
+void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
+void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
set<std::string> functionWordList;
+void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L);
void loadFunctionWords( const string &fileNameFunctionWords );
double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
@@ -100,20 +122,27 @@ void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, co
int main(int argc, char* argv[])
{
- std::cerr << "Score v2.1 -- "
+ std::cerr << "Score v2.1 -- "
<< "scoring methods for extracted rules" << std::endl;
ScoreFeatureManager featureManager;
if (argc < 4) {
- std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
+ std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelSet] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
std::cerr << featureManager.usage() << std::endl;
exit(1);
}
std::string fileNameExtract = argv[1];
std::string fileNameLex = argv[2];
std::string fileNamePhraseTable = argv[3];
+ std::string fileNameSourceLabelSet;
std::string fileNameCountOfCounts;
std::string fileNameFunctionWords;
+ std::string fileNameLeftHandSideSourceLabelCounts;
+ std::string fileNameLeftHandSideTargetSourceLabelCounts;
+ std::string fileNameTargetPreferenceLabelSet;
+ std::string fileNameLeftHandSideTargetPreferenceLabelCounts;
+ std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts;
+ std::string fileNamePhraseOrientationPriors;
std::vector<std::string> featureArgs; // all unknown args passed to feature manager
for(int i=4; i<argc; i++) {
@@ -126,9 +155,32 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--PCFG") == 0) {
pcfgFlag = true;
std::cerr << "including PCFG scores" << std::endl;
+ } else if (strcmp(argv[i],"--PhraseOrientation") == 0) {
+ phraseOrientationFlag = true;
+ std::cerr << "including phrase orientation information" << std::endl;
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
treeFragmentsFlag = true;
- std::cerr << "including tree fragment information from syntactic parse\n";
+ std::cerr << "including tree fragment information from syntactic parse" << std::endl;
+ } else if (strcmp(argv[i],"--SourceLabels") == 0) {
+ sourceSyntaxLabelsFlag = true;
+ std::cerr << "including source label information" << std::endl;
+ } else if (strcmp(argv[i],"--SourceLabelSet") == 0) {
+ sourceSyntaxLabelSetFlag = true;
+ fileNameSourceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.src";
+ std::cerr << "writing source syntax label set to file " << fileNameSourceLabelSet << std::endl;
+ } else if (strcmp(argv[i],"--SourceLabelCountsLHS") == 0) {
+ sourceSyntaxLabelCountsLHSFlag = true;
+ fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs";
+ fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs";
+ std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
+ } else if (strcmp(argv[i],"--TargetPreferenceLabels") == 0) {
+ targetPreferenceLabelsFlag = true;
+ std::cerr << "including target preference label information" << std::endl;
+ fileNameTargetPreferenceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
+ std::cerr << "writing target preference label set to file " << fileNameTargetPreferenceLabelSet << std::endl;
+ fileNameLeftHandSideTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
+ fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
+ std::cerr << "counting left-hand side target preference labels and writing them to files " << fileNameLeftHandSideTargetPreferenceLabelCounts << " and " << fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts << std::endl;
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
unpairedExtractFormatFlag = true;
std::cerr << "processing unpaired extract format" << std::endl;
@@ -155,7 +207,7 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
unalignedFWFlag = true;
if (i+1==argc) {
- std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
+ std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
exit(1);
}
fileNameFunctionWords = argv[++i];
@@ -174,6 +226,20 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
crossedNonTerm = true;
std::cerr << "crossed non-term reordering feature" << std::endl;
+ } else if (strcmp(argv[i],"--PhraseOrientationPriors") == 0) {
+ phraseOrientationPriorsFlag = true;
+ if (i+1==argc) {
+ std::cerr << "ERROR: specify priors file for phrase orientation!" << std::endl;
+ exit(1);
+ }
+ fileNamePhraseOrientationPriors = argv[++i];
+ std::cerr << "smoothing phrase orientation with priors from " << fileNamePhraseOrientationPriors << std::endl;
+ } else if (strcmp(argv[i],"--SpanLength") == 0) {
+ spanLength = true;
+ std::cerr << "span length feature" << std::endl;
+ } else if (strcmp(argv[i],"--NonTermContext") == 0) {
+ nonTermContext = true;
+ std::cerr << "non-term context" << std::endl;
} else {
featureArgs.push_back(argv[i]);
++i;
@@ -206,6 +272,10 @@ int main(int argc, char* argv[])
for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
}
+ if (phraseOrientationPriorsFlag) {
+ loadOrientationPriors(fileNamePhraseOrientationPriors,orientationClassPriorsL2R,orientationClassPriorsR2L);
+ }
+
// sorted phrase extraction file
Moses::InputFileStream extractFile(fileNameExtract);
@@ -224,15 +294,15 @@ int main(int argc, char* argv[])
Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
bool success = outputFile->Open(fileNamePhraseTable);
if (!success) {
- std::cerr << "ERROR: could not open file phrase table file "
- << fileNamePhraseTable << std::endl;
+ std::cerr << "ERROR: could not open file phrase table file "
+ << fileNamePhraseTable << std::endl;
exit(1);
}
phraseTableFile = outputFile;
}
// loop through all extracted phrase translations
- char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH];
+ string line, lastLine;
lastLine[0] = '\0';
ExtractionPhrasePair *phrasePair = NULL;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
@@ -245,18 +315,18 @@ int main(int argc, char* argv[])
float tmpCount=0.0f, tmpPcfgSum=0.0f;
int i=0;
- SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
- if ( !extractFileP.eof() ) {
+ // TODO why read only the 1st line?
+ if ( getline(extractFileP, line) ) {
++i;
tmpPhraseSource = new PHRASE();
tmpPhraseTarget = new PHRASE();
tmpTargetToSourceAlignment = new ALIGNMENT();
- processLine( std::string(line),
+ processLine( std::string(line),
i, featureManager.includeSentenceId(), tmpSentenceId,
- tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
+ tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
tmpAdditionalPropertiesString,
tmpCount, tmpPcfgSum);
- phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
+ phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
tmpTargetToSourceAlignment,
tmpCount, tmpPcfgSum );
phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
@@ -265,39 +335,35 @@ int main(int argc, char* argv[])
if ( hierarchicalFlag ) {
phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
}
- strcpy( lastLine, line );
- SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
+ lastLine = line;
}
- while ( !extractFileP.eof() ) {
+ while ( getline(extractFileP, line) ) {
if ( ++i % 100000 == 0 ) {
std::cerr << "." << std::flush;
}
// identical to last line? just add count
- if (strcmp(line,lastLine) == 0) {
+ if (line == lastLine) {
phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
- SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
continue;
} else {
- strcpy( lastLine, line );
+ lastLine = line;
}
tmpPhraseSource = new PHRASE();
tmpPhraseTarget = new PHRASE();
tmpTargetToSourceAlignment = new ALIGNMENT();
tmpAdditionalPropertiesString.clear();
- processLine( std::string(line),
+ processLine( std::string(line),
i, featureManager.includeSentenceId(), tmpSentenceId,
- tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
+ tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
tmpAdditionalPropertiesString,
- tmpCount, tmpPcfgSum);
+ tmpCount, tmpPcfgSum);
bool matchesPrevious = false;
- bool sourceMatch = true;
- bool targetMatch = true;
- bool alignmentMatch = true; // be careful with these,
+ bool sourceMatch = true; bool targetMatch = true; bool alignmentMatch = true; // be careful with these,
// ExtractionPhrasePair::Matches() checks them in order and does not continue with the others
// once the first of them has been found to have to be set to false
@@ -332,7 +398,7 @@ int main(int argc, char* argv[])
if ( !phrasePairsWithSameSource.empty() &&
!sourceMatch ) {
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
delete *iter;
}
@@ -349,8 +415,8 @@ int main(int argc, char* argv[])
}
}
- phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
- tmpTargetToSourceAlignment,
+ phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
+ tmpTargetToSourceAlignment,
tmpCount, tmpPcfgSum );
phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
@@ -361,12 +427,10 @@ int main(int argc, char* argv[])
}
}
- SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-
}
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
delete *iter;
}
@@ -382,16 +446,37 @@ int main(int argc, char* argv[])
if (goodTuringFlag || kneserNeyFlag) {
writeCountOfCounts( fileNameCountOfCounts );
}
+
+ // source syntax labels
+ if (sourceSyntaxLabelsFlag && sourceSyntaxLabelSetFlag && !inverseFlag) {
+ writeLabelSet( sourceLabelSet, fileNameSourceLabelSet );
+ }
+ if (sourceSyntaxLabelsFlag && sourceSyntaxLabelCountsLHSFlag && !inverseFlag) {
+ writeLeftHandSideLabelCounts( sourceLHSCounts,
+ targetLHSAndSourceLHSJointCounts,
+ fileNameLeftHandSideSourceLabelCounts,
+ fileNameLeftHandSideTargetSourceLabelCounts );
+ }
+
+ // target preference labels
+ if (targetPreferenceLabelsFlag && !inverseFlag) {
+ writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet );
+ writeLeftHandSideLabelCounts( targetPreferenceLHSCounts,
+ ruleTargetLHSAndTargetPreferenceLHSJointCounts,
+ fileNameLeftHandSideTargetPreferenceLabelCounts,
+ fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts );
+ }
}
void processLine( std::string line,
- int lineID, bool includeSentenceIdFlag, int &sentenceId,
+ int lineID, bool includeSentenceIdFlag, int &sentenceId,
PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
std::string &additionalPropertiesString,
float &count, float &pcfgSum )
{
- size_t foundAdditionalProperties = line.find("{{");
+ size_t foundAdditionalProperties = line.rfind("|||");
+ foundAdditionalProperties = line.find("{{",foundAdditionalProperties);
if (foundAdditionalProperties != std::string::npos) {
additionalPropertiesString = line.substr(foundAdditionalProperties);
line = line.substr(0,foundAdditionalProperties);
@@ -476,7 +561,71 @@ void writeCountOfCounts( const string &fileNameCountOfCounts )
}
-void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
+void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
+ const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
+ const std::string &fileNameLeftHandSideSourceLabelCounts,
+ const std::string &fileNameLeftHandSideTargetSourceLabelCounts )
+{
+ // open file
+ Moses::OutputFileStream leftHandSideSourceLabelCounts;
+ bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts.c_str());
+ if (!success) {
+ std::cerr << "ERROR: could not open left-hand side label counts file "
+ << fileNameLeftHandSideSourceLabelCounts << std::endl;
+ return;
+ }
+
+ // write source left-hand side counts
+ for (boost::unordered_map<std::string,float>::const_iterator iter=sourceLHSCounts.begin();
+ iter!=sourceLHSCounts.end(); ++iter) {
+ leftHandSideSourceLabelCounts << iter->first << " " << iter->second << std::endl;
+ }
+
+ leftHandSideSourceLabelCounts.Close();
+
+ // open file
+ Moses::OutputFileStream leftHandSideTargetSourceLabelCounts;
+ success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts.c_str());
+ if (!success) {
+ std::cerr << "ERROR: could not open left-hand side label joint counts file "
+ << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
+ return;
+ }
+
+ // write source left-hand side / target left-hand side joint counts
+ for (boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::const_iterator iter=targetLHSAndSourceLHSJointCounts.begin();
+ iter!=targetLHSAndSourceLHSJointCounts.end(); ++iter) {
+ for (boost::unordered_map<std::string,float>::const_iterator iter2=(iter->second)->begin();
+ iter2!=(iter->second)->end(); ++iter2) {
+ leftHandSideTargetSourceLabelCounts << iter->first << " "<< iter2->first << " " << iter2->second << std::endl;
+ }
+ }
+
+ leftHandSideTargetSourceLabelCounts.Close();
+}
+
+
+void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName )
+{
+ // open file
+ Moses::OutputFileStream out;
+ bool success = out.Open(fileName.c_str());
+ if (!success) {
+ std::cerr << "ERROR: could not open label set file "
+ << fileName << std::endl;
+ return;
+ }
+
+ for (std::set<std::string>::const_iterator iter=labelSet.begin();
+ iter!=labelSet.end(); ++iter) {
+ out << *iter << std::endl;
+ }
+
+ out.Close();
+}
+
+
+void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
{
if (phrasePairsWithSameSource.size() == 0) {
@@ -488,23 +637,23 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa
//std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
// loop through phrase pairs
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
// add to total count
totalSource += (*iter)->GetCount();
}
// output the distinct phrase pairs, one at a time
- for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
// add to total count
outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
}
}
-void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
- float totalCount, int distinctCount,
- ostream &phraseTableFile,
+void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
+ float totalCount, int distinctCount,
+ ostream &phraseTableFile,
const ScoreFeatureManager& featureManager,
const MaybeLog& maybeLogProb )
{
@@ -559,45 +708,45 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
// alignment
if ( hierarchicalFlag ) {
- // always output alignment if hiero style
- assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
- std::vector<std::string> alignment;
- for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
- if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
- if ( bestAlignmentT2S->at(j).size() != 1 ) {
- std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
- phraseTableFile.flush();
- assert(bestAlignmentT2S->at(j).size() == 1);
- }
- size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
- //phraseTableFile << sourcePos << "-" << j << " ";
- std::stringstream point;
- point << sourcePos << "-" << j;
- alignment.push_back(point.str());
- } else {
- for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
- setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
- size_t sourcePos = *setIter;
+ // always output alignment if hiero style
+ assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
+ std::vector<std::string> alignment;
+ for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
+ if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
+ if ( bestAlignmentT2S->at(j).size() != 1 ) {
+ std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
+ phraseTableFile.flush();
+ assert(bestAlignmentT2S->at(j).size() == 1);
+ }
+ size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
+ //phraseTableFile << sourcePos << "-" << j << " ";
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
+ } else {
+ for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
+ setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
+ size_t sourcePos = *setIter;
+ std::stringstream point;
+ point << sourcePos << "-" << j;
+ alignment.push_back(point.str());
+ }
}
}
- }
- // now print all alignments, sorted by source index
- sort(alignment.begin(), alignment.end());
- for (size_t i = 0; i < alignment.size(); ++i) {
- phraseTableFile << alignment[i] << " ";
- }
+ // now print all alignments, sorted by source index
+ sort(alignment.begin(), alignment.end());
+ for (size_t i = 0; i < alignment.size(); ++i) {
+ phraseTableFile << alignment[i] << " ";
+ }
} else if ( !inverseFlag && wordAlignmentFlag) {
- // alignment info in pb model
- for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
- for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
- setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
- size_t sourcePos = *setIter;
- phraseTableFile << sourcePos << "-" << j << " ";
+ // alignment info in pb model
+ for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
+ for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
+ setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
+ size_t sourcePos = *setIter;
+ phraseTableFile << sourcePos << "-" << j << " ";
+ }
}
- }
}
phraseTableFile << " ||| ";
@@ -648,11 +797,6 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
if (kneserNeyFlag)
phraseTableFile << " " << distinctCount;
- if ((treeFragmentsFlag) &&
- !inverseFlag) {
- phraseTableFile << " |||";
- }
-
phraseTableFile << " |||";
// tree fragments
@@ -663,11 +807,163 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
}
+ // syntax labels
+ if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) {
+ unsigned nNTs = 1;
+ for(size_t j=0; j<phraseSource->size()-1; ++j) {
+ if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
+ ++nNTs;
+ }
+ // source syntax labels
+ if (sourceSyntaxLabelsFlag) {
+ std::string sourceLabelCounts;
+ sourceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("SourceLabels",
+ sourceLabelSet,
+ sourceLHSCounts,
+ targetLHSAndSourceLHSJointCounts,
+ vcbT);
+ if ( !sourceLabelCounts.empty() ) {
+ phraseTableFile << " {{SourceLabels "
+ << nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
+ << " "
+ << count // rule count
+ << sourceLabelCounts
+ << "}}";
+ }
+ }
+ // target preference labels
+ if (targetPreferenceLabelsFlag) {
+ std::string targetPreferenceLabelCounts;
+ targetPreferenceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
+ targetPreferenceLabelSet,
+ targetPreferenceLHSCounts,
+ ruleTargetLHSAndTargetPreferenceLHSJointCounts,
+ vcbT);
+ if ( !targetPreferenceLabelCounts.empty() ) {
+ phraseTableFile << " {{TargetPreferences "
+ << nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
+ << " "
+ << count // rule count
+ << targetPreferenceLabelCounts
+ << "}}";
+ }
+ }
+ }
+
+ // phrase orientation
+ if (phraseOrientationFlag && !inverseFlag) {
+ phraseTableFile << " {{Orientation ";
+ phrasePair.CollectAllPhraseOrientations("Orientation",orientationClassPriorsL2R,orientationClassPriorsR2L,0.5,phraseTableFile);
+ phraseTableFile << "}}";
+ }
+
+ if (spanLength && !inverseFlag) {
+ string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
+ if (!propValue.empty()) {
+ phraseTableFile << " {{SpanLength " << propValue << "}}";
+ }
+ }
+
+ if (nonTermContext && !inverseFlag) {
+ string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
+ if (!propValue.empty()) {
+ phraseTableFile << " {{NonTermContext " << propValue << "}}";
+ }
+ }
+
phraseTableFile << std::endl;
}
+void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
+ std::vector<float> &orientationClassPriorsL2R,
+ std::vector<float> &orientationClassPriorsR2L)
+{
+ assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dright dleft
+
+ std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors;
+ ifstream inFile;
+ inFile.open(fileNamePhraseOrientationPriors.c_str());
+ if (inFile.fail()) {
+ std::cerr << " - ERROR: could not open file" << std::endl;
+ exit(1);
+ }
+
+ std::string line;
+ size_t linesRead = 0;
+ float l2rSum = 0;
+ float r2lSum = 0;
+ while (getline(inFile, line)) {
+ istringstream tokenizer(line);
+ std::string key;
+ tokenizer >> key;
+
+ bool l2rFlag = false;
+ bool r2lFlag = false;
+ if (!key.substr(0,4).compare("L2R_")) {
+ l2rFlag = true;
+ }
+ if (!key.substr(0,4).compare("R2L_")) {
+ r2lFlag = true;
+ }
+ if (!l2rFlag && !r2lFlag) {
+ std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
+ }
+ key.erase(0,4);
+
+ int orientationClassId = -1;
+ if (!key.compare("mono")) {
+ orientationClassId = 0;
+ }
+ if (!key.compare("swap")) {
+ orientationClassId = 1;
+ }
+ if (!key.compare("dright")) {
+ orientationClassId = 2;
+ }
+ if (!key.compare("dleft")) {
+ orientationClassId = 3;
+ }
+ if (orientationClassId == -1) {
+ std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
+ }
+
+ float count;
+ tokenizer >> count;
+
+ if (l2rFlag) {
+ orientationClassPriorsL2R[orientationClassId] += count;
+ l2rSum += count;
+ }
+ if (r2lFlag) {
+ orientationClassPriorsR2L[orientationClassId] += count;
+ r2lSum += count;
+ }
+
+ ++linesRead;
+ }
+
+ // normalization: return prior probabilities, not counts
+ if (l2rSum != 0) {
+ for (std::vector<float>::iterator orientationClassPriorsL2RIt = orientationClassPriorsL2R.begin();
+ orientationClassPriorsL2RIt != orientationClassPriorsL2R.end(); ++orientationClassPriorsL2RIt) {
+ *orientationClassPriorsL2RIt /= l2rSum;
+ }
+ }
+ if (r2lSum != 0) {
+ for (std::vector<float>::iterator orientationClassPriorsR2LIt = orientationClassPriorsR2L.begin();
+ orientationClassPriorsR2LIt != orientationClassPriorsR2L.end(); ++orientationClassPriorsR2LIt) {
+ *orientationClassPriorsR2LIt /= r2lSum;
+ }
+ }
+
+ std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl;
+ inFile.close();
+}
+
+
+
bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource )
{
for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) {
@@ -675,7 +971,7 @@ bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *al
// skip
} else {
const std::set<size_t> &sourceSet = alignmentTargetToSource->at(currTarget);
- for (std::set<size_t>::const_iterator iter = sourceSet.begin();
+ for (std::set<size_t>::const_iterator iter = sourceSet.begin();
iter != sourceSet.end(); ++iter) {
size_t currSource = *iter;
@@ -752,11 +1048,9 @@ void loadFunctionWords( const string &fileName )
}
istream *inFileP = &inFile;
- char line[LINE_MAX_LENGTH];
- while(true) {
- SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (inFileP->eof()) break;
- std::vector<string> token = tokenize( line );
+ string line;
+ while(getline(*inFileP, line)) {
+ std::vector<string> token = tokenize( line.c_str() );
if (token.size() > 0)
functionWordList.insert( token[0] );
}
@@ -801,20 +1095,17 @@ void LexicalTable::load( const string &fileName )
}
istream *inFileP = &inFile;
- char line[LINE_MAX_LENGTH];
-
+ string line;
int i=0;
- while(true) {
+ while(getline(*inFileP, line)) {
i++;
if (i%100000 == 0) std::cerr << "." << flush;
- SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (inFileP->eof()) break;
- std::vector<string> token = tokenize( line );
+ std::vector<string> token = tokenize( line.c_str() );
if (token.size() != 3) {
- std::cerr << "line " << i << " in " << fileName
- << " has wrong number of tokens, skipping:" << std::endl
- << token.size() << " " << token[0] << " " << line << std::endl;
+ std::cerr << "line " << i << " in " << fileName
+ << " has wrong number of tokens, skipping:" << std::endl
+ << token.size() << " " << token[0] << " " << line << std::endl;
continue;
}
@@ -893,19 +1184,19 @@ void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget,
- const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment)
-{
-// typedef std::vector< std::set<size_t> > ALIGNMENT;
+ const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment) {
+// typedef std::vector< std::set<size_t> > ALIGNMENT;
outSourceToTargetAlignment->clear();
size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size());
outSourceToTargetAlignment->resize(numberOfSourceSymbols);
// add alignment point
for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) {
- for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin();
+ for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin();
setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) {
size_t sourcePosition = *setIter;
outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition);
}
}
}
+
diff --git a/phrase-extract/score-stsg/Jamfile b/phrase-extract/score-stsg/Jamfile
new file mode 100644
index 000000000..6ae17b565
--- /dev/null
+++ b/phrase-extract/score-stsg/Jamfile
@@ -0,0 +1 @@
+exe score-stsg : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : <include>.. ;
diff --git a/phrase-extract/score-stsg/LexicalTable.cpp b/phrase-extract/score-stsg/LexicalTable.cpp
new file mode 100644
index 000000000..d5d7ce6ab
--- /dev/null
+++ b/phrase-extract/score-stsg/LexicalTable.cpp
@@ -0,0 +1,56 @@
+#include "LexicalTable.h"
+
+#include "util/tokenize_piece.hh"
+
+#include <cstdlib>
+#include <iostream>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+LexicalTable::LexicalTable(Vocabulary &srcVocab, Vocabulary &tgtVocab)
+ : m_srcVocab(srcVocab)
+ , m_tgtVocab(tgtVocab)
+{
+}
+
+void LexicalTable::Load(std::istream &input)
+{
+ const util::AnyCharacter delimiter(" \t");
+
+ std::string line;
+ std::string tmp;
+ int i = 0;
+ while (getline(input, line)) {
+ ++i;
+ if (i%100000 == 0) {
+ std::cerr << ".";
+ }
+
+ util::TokenIter<util::AnyCharacter> it(line, delimiter);
+
+ // Target word
+ it->CopyToString(&tmp);
+ Vocabulary::IdType tgtId = m_tgtVocab.Insert(tmp);
+ ++it;
+
+ // Source word.
+ it->CopyToString(&tmp);
+ Vocabulary::IdType srcId = m_srcVocab.Insert(tmp);
+ ++it;
+
+ // Probability.
+ it->CopyToString(&tmp);
+ double prob = atof(tmp.c_str());
+ m_table[srcId][tgtId] = prob;
+ }
+ std::cerr << std::endl;
+}
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/LexicalTable.h b/phrase-extract/score-stsg/LexicalTable.h
new file mode 100644
index 000000000..54bae1dec
--- /dev/null
+++ b/phrase-extract/score-stsg/LexicalTable.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <istream>
+#include <string>
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "Vocabulary.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+class LexicalTable
+{
+public:
+ LexicalTable(Vocabulary &, Vocabulary &);
+
+ void Load(std::istream &);
+
+ double PermissiveLookup(Vocabulary::IdType s, Vocabulary::IdType t) {
+ OuterMap::const_iterator p = m_table.find(s);
+ if (p == m_table.end()) {
+ return 1.0;
+ }
+ const InnerMap &inner = p->second;
+ InnerMap::const_iterator q = inner.find(t);
+ return q == inner.end() ? 1.0 : q->second;
+ }
+
+private:
+ typedef boost::unordered_map<Vocabulary::IdType, double> InnerMap;
+ typedef boost::unordered_map<Vocabulary::IdType, InnerMap> OuterMap;
+
+ Vocabulary &m_srcVocab;
+ Vocabulary &m_tgtVocab;
+ OuterMap m_table;
+};
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/Main.cpp b/phrase-extract/score-stsg/Main.cpp
new file mode 100644
index 000000000..4a8f7a57f
--- /dev/null
+++ b/phrase-extract/score-stsg/Main.cpp
@@ -0,0 +1,7 @@
+#include "ScoreStsg.h"
+
+int main(int argc, char *argv[])
+{
+ MosesTraining::Syntax::ScoreStsg::ScoreStsg tool;
+ return tool.Main(argc, argv);
+}
diff --git a/phrase-extract/score-stsg/Options.h b/phrase-extract/score-stsg/Options.h
new file mode 100644
index 000000000..17b959c84
--- /dev/null
+++ b/phrase-extract/score-stsg/Options.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <string>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+struct Options {
+public:
+ Options()
+ : goodTuring(false)
+ , inverse(false)
+ , kneserNey(false)
+ , logProb(false)
+ , minCountHierarchical(0)
+ , negLogProb(false)
+ , noLex(false)
+ , noWordAlignment(false)
+ , pcfg(false) {}
+
+ // Positional options
+ std::string extractFile;
+ std::string lexFile;
+ std::string tableFile;
+
+ // All other options
+ bool goodTuring;
+ bool inverse;
+ bool kneserNey;
+ bool logProb;
+ int minCountHierarchical;
+ bool negLogProb;
+ bool noLex;
+ bool noWordAlignment;
+ bool pcfg;
+};
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/RuleGroup.cpp b/phrase-extract/score-stsg/RuleGroup.cpp
new file mode 100644
index 000000000..bbbe3b2b6
--- /dev/null
+++ b/phrase-extract/score-stsg/RuleGroup.cpp
@@ -0,0 +1,45 @@
+#include "RuleGroup.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+void RuleGroup::SetNewSource(const StringPiece &source)
+{
+ source.CopyToString(&m_source);
+ m_distinctRules.clear();
+ m_totalCount = 0;
+}
+
+void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign,
+ const StringPiece &fullAlign, int count)
+{
+ if (m_distinctRules.empty() ||
+ ntAlign != m_distinctRules.back().ntAlign ||
+ target != m_distinctRules.back().target) {
+ DistinctRule r;
+ target.CopyToString(&r.target);
+ ntAlign.CopyToString(&r.ntAlign);
+ r.alignments.resize(r.alignments.size()+1);
+ fullAlign.CopyToString(&r.alignments.back().first);
+ r.alignments.back().second = count;
+ r.count = count;
+ m_distinctRules.push_back(r);
+ } else {
+ DistinctRule &r = m_distinctRules.back();
+ if (r.alignments.back().first != fullAlign) {
+ r.alignments.resize(r.alignments.size()+1);
+ fullAlign.CopyToString(&r.alignments.back().first);
+ }
+ r.alignments.back().second += count;
+ r.count += count;
+ }
+ m_totalCount += count;
+}
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/RuleGroup.h b/phrase-extract/score-stsg/RuleGroup.h
new file mode 100644
index 000000000..de0c25f17
--- /dev/null
+++ b/phrase-extract/score-stsg/RuleGroup.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "util/string_piece.hh"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+// A group of rules that share the same source-side. Rules are added through
+// calls to SetNewSource() and AddRule(). They can then be accessed via the
+// iterators.
+//
+// It is assumed that rules with the same (target, ntAlign, alignment) value
+// will be added consecutively, and so will rules with the same
+// (target, ntAlign) value. In other words, it is assumed that rules will be
+// added in the order they occur in a correctly-sorted extract file.
+class RuleGroup
+{
+public:
+ // Stores the target-side and NT-alignment of a distinct rule. Also records
+ // the rule's count and the observed symbol alignments (plus their
+ // frequencies).
+ struct DistinctRule {
+ std::string target;
+ std::string ntAlign;
+ std::vector<std::pair<std::string, int> > alignments;
+ int count;
+ };
+
+ typedef std::vector<DistinctRule>::const_iterator ConstIterator;
+
+ // Begin and End iterators for iterating over the group's distinct rules.
+ ConstIterator Begin() const { return m_distinctRules.begin(); }
+ ConstIterator End() const { return m_distinctRules.end(); }
+
+ // Get the current source-side value.
+ const std::string &GetSource() const { return m_source; }
+
+ // Get the number of distinct rules.
+ int GetSize() const { return m_distinctRules.size(); }
+
+ // Get the total count.
+ int GetTotalCount() const { return m_totalCount; }
+
+ // Clear the rule group and set a new source-side value. This must be
+ // done once for every new source-side value, prior to the first call to
+ // AddRule().
+ void SetNewSource(const StringPiece &source);
+
+ // Add a rule. To determine rule distinctness, the target and ntAlign
+ // values will be checked against those of the previous rule only (in other
+ // words, the input is assumed to be ordered).
+ void AddRule(const StringPiece &target, const StringPiece &ntAlign,
+ const StringPiece &fullAlign, int count);
+
+private:
+ std::string m_source;
+ std::vector<DistinctRule> m_distinctRules;
+ int m_totalCount;
+};
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/RuleSymbol.h b/phrase-extract/score-stsg/RuleSymbol.h
new file mode 100644
index 000000000..efefe6266
--- /dev/null
+++ b/phrase-extract/score-stsg/RuleSymbol.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "util/string_piece.hh"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+struct RuleSymbol
+{
+ StringPiece value;
+ bool isNonTerminal;
+};
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/RuleTableWriter.cpp b/phrase-extract/score-stsg/RuleTableWriter.cpp
new file mode 100644
index 000000000..0a1d5aa08
--- /dev/null
+++ b/phrase-extract/score-stsg/RuleTableWriter.cpp
@@ -0,0 +1,80 @@
+#include "RuleTableWriter.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+#include "InputFileStream.h"
+#include "LexicalTable.h"
+#include "OutputFileStream.h"
+#include "Options.h"
+#include "RuleGroup.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+void RuleTableWriter::WriteLine(const TokenizedRuleHalf &source,
+ const TokenizedRuleHalf &target,
+ const std::string &bestAlignment,
+ double lexScore, int count, int totalCount,
+ int distinctCount)
+{
+ if (m_options.inverse) {
+ WriteRuleHalf(target);
+ m_out << " ||| ";
+ WriteRuleHalf(source);
+ } else {
+ WriteRuleHalf(source);
+ m_out << " ||| ";
+ WriteRuleHalf(target);
+ }
+
+ m_out << " |||" << bestAlignment << "||| ";
+
+ if (!m_options.noLex) {
+ m_out << MaybeLog(lexScore);
+ }
+
+ // TODO PCFG
+
+ m_out << " ||| " << totalCount << " " << count;
+ if (m_options.kneserNey) {
+ m_out << " " << distinctCount;
+ }
+ m_out << " |||";
+ m_out << std::endl;
+}
+
+void RuleTableWriter::WriteRuleHalf(const TokenizedRuleHalf &half)
+{
+ if (half.IsTree()) {
+ m_out << half.string;
+ return;
+ }
+
+ for (std::vector<RuleSymbol>::const_iterator p = half.frontierSymbols.begin();
+ p != half.frontierSymbols.end(); ++p) {
+ if (p->isNonTerminal) {
+ m_out << "[" << p->value << "][" << p->value << "] ";
+ } else {
+ m_out << p->value << " ";
+ }
+ }
+ m_out << "[X]";
+}
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/RuleTableWriter.h b/phrase-extract/score-stsg/RuleTableWriter.h
new file mode 100644
index 000000000..db8924de3
--- /dev/null
+++ b/phrase-extract/score-stsg/RuleTableWriter.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <cmath>
+#include <string>
+
+#include "OutputFileStream.h"
+
+#include "Options.h"
+#include "TokenizedRuleHalf.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+class RuleTableWriter
+{
+public:
+ RuleTableWriter(const Options &options, Moses::OutputFileStream &out)
+ : m_options(options)
+ , m_out(out) {}
+
+ void WriteLine(const TokenizedRuleHalf &, const TokenizedRuleHalf &,
+ const std::string &, double, int, int, int);
+
+private:
+ double MaybeLog(double a) const {
+ if (!m_options.logProb) {
+ return a;
+ }
+ return m_options.negLogProb ? -log(a) : log(a);
+ }
+
+ void WriteRuleHalf(const TokenizedRuleHalf &);
+
+ const Options &m_options;
+ Moses::OutputFileStream &m_out;
+};
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/ScoreStsg.cpp b/phrase-extract/score-stsg/ScoreStsg.cpp
new file mode 100644
index 000000000..04e3b5a44
--- /dev/null
+++ b/phrase-extract/score-stsg/ScoreStsg.cpp
@@ -0,0 +1,445 @@
+#include "ScoreStsg.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include "util/string_piece.hh"
+#include "util/string_piece_hash.hh"
+#include "util/tokenize_piece.hh"
+
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+
+#include "syntax-common/exception.h"
+
+#include "LexicalTable.h"
+#include "Options.h"
+#include "RuleGroup.h"
+#include "RuleTableWriter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+const int ScoreStsg::kCountOfCountsMax = 10;
+
+ScoreStsg::ScoreStsg()
+ : m_name("score-stsg")
+ , m_lexTable(m_srcVocab, m_tgtVocab)
+ , m_countOfCounts(kCountOfCountsMax, 0)
+ , m_totalDistinct(0)
+{
+}
+
+int ScoreStsg::Main(int argc, char *argv[])
+{
+ // Process command-line options.
+ ProcessOptions(argc, argv, m_options);
+
+ // Open input files.
+ Moses::InputFileStream extractStream(m_options.extractFile);
+ Moses::InputFileStream lexStream(m_options.lexFile);
+
+ // Open output files.
+ Moses::OutputFileStream outStream;
+ Moses::OutputFileStream countOfCountsStream;
+ OpenOutputFileOrDie(m_options.tableFile, outStream);
+ if (m_options.goodTuring || m_options.kneserNey) {
+ OpenOutputFileOrDie(m_options.tableFile+".coc", countOfCountsStream);
+ }
+
+ // Load lexical table.
+ if (!m_options.noLex) {
+ m_lexTable.Load(lexStream);
+ }
+
+ const util::MultiCharacter delimiter("|||");
+ std::size_t lineNum = 0;
+ std::size_t startLine= 0;
+ std::string line;
+ std::string tmp;
+ RuleGroup ruleGroup;
+ RuleTableWriter ruleTableWriter(m_options, outStream);
+
+ while (std::getline(extractStream, line)) {
+ ++lineNum;
+
+ // Tokenize the input line.
+ util::TokenIter<util::MultiCharacter> it(line, delimiter);
+ StringPiece source = *it++;
+ StringPiece target = *it++;
+ StringPiece ntAlign = *it++;
+ StringPiece fullAlign = *it++;
+ it->CopyToString(&tmp);
+ int count = std::atoi(tmp.c_str());
+
+ // If this is the first line or if source has changed since the last
+ // line then process the current rule group and start a new one.
+ if (source != ruleGroup.GetSource()) {
+ if (lineNum > 1) {
+ ProcessRuleGroupOrDie(ruleGroup, ruleTableWriter, startLine, lineNum-1);
+ }
+ startLine = lineNum;
+ ruleGroup.SetNewSource(source);
+ }
+
+ // Add the rule to the current rule group.
+ ruleGroup.AddRule(target, ntAlign, fullAlign, count);
+ }
+
+ // Process the final rule group.
+ ProcessRuleGroupOrDie(ruleGroup, ruleTableWriter, startLine, lineNum);
+
+ // Write count of counts file.
+ if (m_options.goodTuring || m_options.kneserNey) {
+ // Kneser-Ney needs the total number of distinct rules.
+ countOfCountsStream << m_totalDistinct << std::endl;
+ // Write out counts of counts.
+ for (int i = 1; i <= kCountOfCountsMax; ++i) {
+ countOfCountsStream << m_countOfCounts[i] << std::endl;
+ }
+ }
+
+ return 0;
+}
+
+void ScoreStsg::TokenizeRuleHalf(const std::string &s, TokenizedRuleHalf &half)
+{
+ // Copy s to half.string, but strip any leading or trailing whitespace.
+ std::size_t start = s.find_first_not_of(" \t");
+ if (start == std::string::npos) {
+ throw Exception("rule half is empty");
+ }
+ std::size_t end = s.find_last_not_of(" \t");
+ assert(end != std::string::npos);
+ half.string = s.substr(start, end-start+1);
+
+ // Tokenize half.string.
+ half.tokens.clear();
+ for (TreeFragmentTokenizer p(half.string);
+ p != TreeFragmentTokenizer(); ++p) {
+ half.tokens.push_back(*p);
+ }
+
+ // Extract the frontier symbols.
+ half.frontierSymbols.clear();
+ const std::size_t numTokens = half.tokens.size();
+ for (int i = 0; i < numTokens; ++i) {
+ if (half.tokens[i].type != TreeFragmentToken_WORD) {
+ continue;
+ }
+ if (i == 0 || half.tokens[i-1].type != TreeFragmentToken_LSB) {
+ // A word is a terminal iff it doesn't follow '['
+ half.frontierSymbols.resize(half.frontierSymbols.size()+1);
+ half.frontierSymbols.back().value = half.tokens[i].value;
+ half.frontierSymbols.back().isNonTerminal = false;
+ } else if (i+1 < numTokens &&
+ half.tokens[i+1].type == TreeFragmentToken_RSB) {
+ // A word is a non-terminal iff it follows '[' and is succeeded by ']'
+ half.frontierSymbols.resize(half.frontierSymbols.size()+1);
+ half.frontierSymbols.back().value = half.tokens[i].value;
+ half.frontierSymbols.back().isNonTerminal = true;
+ ++i; // Skip over the ']'
+ }
+ }
+}
+
+void ScoreStsg::ProcessRuleGroupOrDie(const RuleGroup &group,
+ RuleTableWriter &writer,
+ std::size_t start,
+ std::size_t end)
+{
+ try {
+ ProcessRuleGroup(group, writer);
+ } catch (const Exception &e) {
+ std::ostringstream msg;
+ msg << "failed to process rule group at lines " << start << "-" << end
+ << ": " << e.msg();
+ Error(msg.str());
+ } catch (const std::exception &e) {
+ std::ostringstream msg;
+ msg << "failed to process rule group at lines " << start << "-" << end
+ << ": " << e.what();
+ Error(msg.str());
+ }
+}
+
+void ScoreStsg::ProcessRuleGroup(const RuleGroup &group,
+ RuleTableWriter &writer)
+{
+ const std::size_t totalCount = group.GetTotalCount();
+ const std::size_t distinctCount = group.GetSize();
+
+ TokenizeRuleHalf(group.GetSource(), m_sourceHalf);
+
+ const bool fullyLexical = m_sourceHalf.IsFullyLexical();
+
+ // Process each distinct rule in turn.
+ for (RuleGroup::ConstIterator p = group.Begin(); p != group.End(); ++p) {
+ const RuleGroup::DistinctRule &rule = *p;
+
+ // Update count of count statistics.
+ if (m_options.goodTuring || m_options.kneserNey) {
+ ++m_totalDistinct;
+ int countInt = rule.count + 0.99999;
+ if (countInt <= kCountOfCountsMax) {
+ ++m_countOfCounts[countInt];
+ }
+ }
+
+ // If the rule is not fully lexical then discard it if the count is below
+ // the threshold value.
+ if (!fullyLexical && rule.count < m_options.minCountHierarchical) {
+ continue;
+ }
+
+ TokenizeRuleHalf(rule.target, m_targetHalf);
+
+ // Find the most frequent alignment (if there's a tie, take the first one).
+ std::vector<std::pair<std::string, int> >::const_iterator q =
+ rule.alignments.begin();
+ const std::pair<std::string, int> *bestAlignmentAndCount = &(*q++);
+ for (; q != rule.alignments.end(); ++q) {
+ if (q->second > bestAlignmentAndCount->second) {
+ bestAlignmentAndCount = &(*q);
+ }
+ }
+ const std::string &bestAlignment = bestAlignmentAndCount->first;
+ ParseAlignmentString(bestAlignment, m_targetHalf.frontierSymbols.size(),
+ m_tgtToSrc);
+
+ // Compute the lexical translation probability.
+ double lexProb = ComputeLexProb(m_sourceHalf.frontierSymbols,
+ m_targetHalf.frontierSymbols, m_tgtToSrc);
+
+ // TODO PCFG score
+
+ // Write a line to the rule table.
+ writer.WriteLine(m_sourceHalf, m_targetHalf, bestAlignment, lexProb,
+ p->count, totalCount, distinctCount);
+ }
+}
+
+void ScoreStsg::ParseAlignmentString(const std::string &s, int numTgtWords,
+ ALIGNMENT &tgtToSrc)
+{
+ tgtToSrc.clear();
+ tgtToSrc.resize(numTgtWords);
+
+ const std::string digits = "0123456789";
+
+ std::string::size_type begin = 0;
+ while (true) {
+ std::string::size_type end = s.find("-", begin);
+ if (end == std::string::npos) {
+ return;
+ }
+ int src = std::atoi(s.substr(begin, end-begin).c_str());
+ if (end+1 == s.size()) {
+ throw Exception("Target index missing");
+ }
+ begin = end+1;
+ end = s.find_first_not_of(digits, begin+1);
+ int tgt;
+ if (end == std::string::npos) {
+ tgt = std::atoi(s.substr(begin).c_str());
+ tgtToSrc[tgt].insert(src);
+ return;
+ } else {
+ tgt = std::atoi(s.substr(begin, end-begin).c_str());
+ tgtToSrc[tgt].insert(src);
+ }
+ begin = end+1;
+ }
+}
+
+double ScoreStsg::ComputeLexProb(const std::vector<RuleSymbol> &sourceFrontier,
+ const std::vector<RuleSymbol> &targetFrontier,
+ const ALIGNMENT &tgtToSrc)
+{
+ double lexScore = 1.0;
+ for (std::size_t i = 0; i < targetFrontier.size(); ++i) {
+ if (targetFrontier[i].isNonTerminal) {
+ continue;
+ }
+ Vocabulary::IdType tgtId = m_tgtVocab.Lookup(targetFrontier[i].value,
+ StringPieceCompatibleHash(),
+ StringPieceCompatibleEquals());
+ const std::set<std::size_t> &srcIndices = tgtToSrc[i];
+ if (srcIndices.empty()) {
+ // Explain unaligned word by NULL.
+ lexScore *= m_lexTable.PermissiveLookup(Vocabulary::NullId(), tgtId);
+ } else {
+ double thisWordScore = 0.0;
+ for (std::set<std::size_t>::const_iterator p = srcIndices.begin();
+ p != srcIndices.end(); ++p) {
+ Vocabulary::IdType srcId =
+ m_srcVocab.Lookup(sourceFrontier[*p].value,
+ StringPieceCompatibleHash(),
+ StringPieceCompatibleEquals());
+ thisWordScore += m_lexTable.PermissiveLookup(srcId, tgtId);
+ }
+ lexScore *= thisWordScore / static_cast<double>(srcIndices.size());
+ }
+ }
+ return lexScore;
+}
+
+void ScoreStsg::OpenOutputFileOrDie(const std::string &filename,
+ Moses::OutputFileStream &stream)
+{
+ bool ret = stream.Open(filename);
+ if (!ret) {
+ std::ostringstream msg;
+ msg << "failed to open output file: " << filename;
+ Error(msg.str());
+ }
+}
+
+void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
+{
+ namespace po = boost::program_options;
+ namespace cls = boost::program_options::command_line_style;
+
+ // Construct the 'top' of the usage message: the bit that comes before the
+ // options list.
+ std::ostringstream usageTop;
+ usageTop << "Usage: " << GetName()
+ << " [OPTION]... EXTRACT LEX TABLE\n\n"
+ << "STSG rule scorer\n\n"
+ << "Options";
+
+ // Construct the 'bottom' of the usage message.
+ std::ostringstream usageBottom;
+ usageBottom << "TODO";
+
+ // Declare the command line options that are visible to the user.
+ po::options_description visible(usageTop.str());
+ visible.add_options()
+ ("GoodTuring",
+ "apply Good-Turing smoothing to relative frequency probability estimates")
+ ("Hierarchical",
+ "ignored (included for compatibility with score)")
+ ("Inverse",
+ "use inverse mode")
+ ("KneserNey",
+ "apply Kneser-Ney smoothing to relative frequency probability estimates")
+ ("LogProb",
+ "output log probabilities")
+ ("MinCountHierarchical",
+ po::value(&options.minCountHierarchical)->
+ default_value(options.minCountHierarchical),
+ "filter out rules with frequency < arg (except fully lexical rules)")
+ ("NegLogProb",
+ "output negative log probabilities")
+ ("NoLex",
+ "do not compute lexical translation score")
+ ("NoWordAlignment",
+ "do not output word alignments")
+ ("PCFG",
+ "include pre-computed PCFG score from extract")
+ ("UnpairedExtractFormat",
+ "ignored (included for compatibility with score)")
+ ;
+
+ // Declare the command line options that are hidden from the user
+ // (these are used as positional options).
+ po::options_description hidden("Hidden options");
+ hidden.add_options()
+ ("ExtractFile",
+ po::value(&options.extractFile),
+ "extract file")
+ ("LexFile",
+ po::value(&options.lexFile),
+ "lexical probability file")
+ ("TableFile",
+ po::value(&options.tableFile),
+ "output file")
+ ;
+
+ // Compose the full set of command-line options.
+ po::options_description cmdLineOptions;
+ cmdLineOptions.add(visible).add(hidden);
+
+ // Register the positional options.
+ po::positional_options_description p;
+ p.add("ExtractFile", 1);
+ p.add("LexFile", 1);
+ p.add("TableFile", 1);
+
+ // Process the command-line.
+ po::variables_map vm;
+ const int optionStyle = cls::allow_long
+ | cls::long_allow_adjacent
+ | cls::long_allow_next;
+ try {
+ po::store(po::command_line_parser(argc, argv).style(optionStyle).
+ options(cmdLineOptions).positional(p).run(), vm);
+ po::notify(vm);
+ } catch (const std::exception &e) {
+ std::ostringstream msg;
+ msg << e.what() << "\n\n" << visible << usageBottom.str();
+ Error(msg.str());
+ }
+
+ if (vm.count("help")) {
+ std::cout << visible << usageBottom.str() << std::endl;
+ std::exit(0);
+ }
+
+ // Check all positional options were given.
+ if (!vm.count("ExtractFile") ||
+ !vm.count("LexFile") ||
+ !vm.count("TableFile")) {
+ std::ostringstream msg;
+ std::cerr << visible << usageBottom.str() << std::endl;
+ std::exit(1);
+ }
+
+ // Process Boolean options.
+ if (vm.count("GoodTuring")) {
+ options.goodTuring = true;
+ }
+ if (vm.count("Inverse")) {
+ options.inverse = true;
+ }
+ if (vm.count("KneserNey")) {
+ options.kneserNey = true;
+ }
+ if (vm.count("LogProb")) {
+ options.logProb = true;
+ }
+ if (vm.count("NegLogProb")) {
+ options.negLogProb = true;
+ }
+ if (vm.count("NoLex")) {
+ options.noLex = true;
+ }
+ if (vm.count("NoWordAlignment")) {
+ options.noWordAlignment = true;
+ }
+ if (vm.count("PCFG")) {
+ options.pcfg = true;
+ }
+}
+
+void ScoreStsg::Error(const std::string &msg) const
+{
+ std::cerr << GetName() << ": " << msg << std::endl;
+ std::exit(1);
+}
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/ScoreStsg.h b/phrase-extract/score-stsg/ScoreStsg.h
new file mode 100644
index 000000000..628c0080e
--- /dev/null
+++ b/phrase-extract/score-stsg/ScoreStsg.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include <map>
+#include <ostream>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "ExtractionPhrasePair.h"
+#include "OutputFileStream.h"
+
+#include "LexicalTable.h"
+#include "Options.h"
+#include "RuleSymbol.h"
+#include "TokenizedRuleHalf.h"
+#include "Vocabulary.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+class RuleGroup;
+class RuleTableWriter;
+
+class ScoreStsg
+{
+public:
+ ScoreStsg();
+
+ const std::string &GetName() const {
+ return m_name;
+ }
+
+ int Main(int argc, char *argv[]);
+
+private:
+ static const int kCountOfCountsMax;
+
+ double ComputeLexProb(const std::vector<RuleSymbol> &,
+ const std::vector<RuleSymbol> &,
+ const ALIGNMENT &);
+
+ void Error(const std::string &) const;
+
+ void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &);
+
+ void ParseAlignmentString(const std::string &, int,
+ ALIGNMENT &);
+
+ void ProcessOptions(int, char *[], Options &) const;
+
+ void ProcessRuleGroup(const RuleGroup &, RuleTableWriter &);
+
+ void ProcessRuleGroupOrDie(const RuleGroup &, RuleTableWriter &,
+ std::size_t, std::size_t);
+
+ void TokenizeRuleHalf(const std::string &, TokenizedRuleHalf &);
+
+ std::string m_name;
+ Options m_options;
+ Vocabulary m_srcVocab;
+ Vocabulary m_tgtVocab;
+ LexicalTable m_lexTable;
+ std::vector<int> m_countOfCounts;
+ int m_totalDistinct;
+ TokenizedRuleHalf m_sourceHalf;
+ TokenizedRuleHalf m_targetHalf;
+ ALIGNMENT m_tgtToSrc;
+};
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/TokenizedRuleHalf.cpp b/phrase-extract/score-stsg/TokenizedRuleHalf.cpp
new file mode 100644
index 000000000..6ccc2a311
--- /dev/null
+++ b/phrase-extract/score-stsg/TokenizedRuleHalf.cpp
@@ -0,0 +1,40 @@
+#include "TokenizedRuleHalf.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+bool TokenizedRuleHalf::IsFullyLexical() const
+{
+ for (std::vector<RuleSymbol>::const_iterator p = frontierSymbols.begin();
+ p != frontierSymbols.end(); ++p) {
+ if (p->isNonTerminal) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool TokenizedRuleHalf::IsString() const
+{
+ // A rule half is either a string (like "[X] and [X]") or a tree (like
+ // "[NP [NP] [CC and] [NP]]").
+ //
+ // A string must start with a terminal or a non-terminal (in square brackets).
+ // A tree must start with '[' followed by a word then either another word or
+ // another '['.
+ return (tokens[0].type == TreeFragmentToken_WORD ||
+ tokens[2].type == TreeFragmentToken_RSB);
+}
+
+bool TokenizedRuleHalf::IsTree() const
+{
+ return !IsString();
+}
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/TokenizedRuleHalf.h b/phrase-extract/score-stsg/TokenizedRuleHalf.h
new file mode 100644
index 000000000..2fbb80f38
--- /dev/null
+++ b/phrase-extract/score-stsg/TokenizedRuleHalf.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "syntax-common/tree_fragment_tokenizer.h"
+
+#include "RuleSymbol.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace ScoreStsg
+{
+
+// Stores one half of a STSG rule, as represented in the extract file. The
+// original string is stored as the member 'string', along with its token
+// sequence ('tokens') and frontier symbol sequence ('frontierSymbols'). Note
+// that 'tokens' and 'frontierSymbols' use StringPiece objects that depend on
+// the original string. Therefore changing the value of 'string' invalidates
+// both 'tokens' and 'frontierSymbols'.
+struct TokenizedRuleHalf
+{
+ bool IsFullyLexical() const;
+ bool IsString() const;
+ bool IsTree() const;
+
+ // The rule half as it appears in the extract file, except with any trailing
+ // or leading spaces removed (here a space is defined as a blank or a tab).
+ std::string string;
+
+ // The token sequence for the string.
+ std::vector<TreeFragmentToken> tokens;
+
+ // The frontier symbols of the rule half. For example:
+ //
+ // string: "[VP [VBN] [PP [IN] [NP [DT] [JJ positive] [NN light]]]]"
+ // frontier: ("VBN",t), ("IN",t), ("DT",t), ("positive",f), ("light",f)
+ //
+ // string: "[X] [X] Sinne [X]"
+ // frontier: ("X",t), ("X",t), ("Sinne",f), ("X",t)
+ //
+ std::vector<RuleSymbol> frontierSymbols;
+};
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score-stsg/Vocabulary.h b/phrase-extract/score-stsg/Vocabulary.h
new file mode 100644
index 000000000..db31c73f5
--- /dev/null
+++ b/phrase-extract/score-stsg/Vocabulary.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <string>
+
+#include "syntax-common/numbered_set.h"
+
+namespace MosesTraining {
+namespace Syntax {
+namespace ScoreStsg {
+
+typedef NumberedSet<std::string, std::size_t> Vocabulary;
+
+} // namespace ScoreStsg
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/score.h b/phrase-extract/score.h
index 6a10536c1..470332a06 100644
--- a/phrase-extract/score.h
+++ b/phrase-extract/score.h
@@ -1,12 +1,22 @@
-#pragma once
-/*
- * score.h
- * extract
- *
- * Created by Hieu Hoang on 28/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
#include <string>
#include <vector>
diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp
index 67373ec93..9d814ed76 100644
--- a/phrase-extract/statistics-main.cpp
+++ b/phrase-extract/statistics-main.cpp
@@ -12,15 +12,12 @@
#include <time.h>
#include "AlignmentPhrase.h"
-#include "SafeGetline.h"
#include "tables-core.h"
#include "InputFileStream.h"
using namespace std;
using namespace MosesTraining;
-#define LINE_MAX_LENGTH 10000
-
namespace MosesTraining
{
@@ -31,7 +28,7 @@ public:
vector< vector<size_t> > alignedToE;
vector< vector<size_t> > alignedToF;
- bool create( char*, int );
+ bool create( const char*, int );
void clear();
bool equals( const PhraseAlignment& );
};
@@ -106,16 +103,14 @@ int main(int argc, char* argv[])
vector< PhraseAlignment > phrasePairsWithSameF;
int i=0;
int fileCount = 0;
- while(true) {
+
+ string line;
+ while(getline(extractFileP, line)) {
if (extractFileP.eof()) break;
if (++i % 100000 == 0) cerr << "." << flush;
- char line[LINE_MAX_LENGTH];
- SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- // if (fileCount>0)
- if (extractFileP.eof())
- break;
+
PhraseAlignment phrasePair;
- bool isPhrasePair = phrasePair.create( line, i );
+ bool isPhrasePair = phrasePair.create( line.c_str(), i );
if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
processPhrasePairs( phrasePairsWithSameF );
for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
@@ -124,7 +119,7 @@ int main(int argc, char* argv[])
phraseTableE.clear();
phraseTableF.clear();
phrasePair.clear(); // process line again, since phrase tables flushed
- phrasePair.create( line, i );
+ phrasePair.create( line.c_str(), i );
phrasePairBase = 0;
}
lastForeign = phrasePair.foreign;
@@ -242,7 +237,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
}
}
-bool PhraseAlignment::create( char line[], int lineID )
+bool PhraseAlignment::create(const char line[], int lineID )
{
vector< string > token = tokenize( line );
int item = 1;
@@ -321,16 +316,14 @@ void LexicalTable::load( const string &filePath )
}
istream *inFileP = &inFile;
- char line[LINE_MAX_LENGTH];
+ string line;
int i=0;
- while(true) {
+ while(getline(*inFileP, line)) {
i++;
if (i%100000 == 0) cerr << "." << flush;
- SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (inFileP->eof()) break;
- vector<string> token = tokenize( line );
+ vector<string> token = tokenize( line.c_str() );
if (token.size() != 3) {
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
token.size() << " " << token[0] << " " << line << endl;
diff --git a/phrase-extract/syntax-common/Jamfile b/phrase-extract/syntax-common/Jamfile
new file mode 100644
index 000000000..c76ab50a5
--- /dev/null
+++ b/phrase-extract/syntax-common/Jamfile
@@ -0,0 +1,8 @@
+lib syntax_common : [ glob *.cc : *_test.cc ] ..//deps : <include>.. ;
+
+import testing ;
+
+for local t in [ glob *_test.cc ] {
+ local name = [ MATCH "(.*)\.cc" : $(t) ] ;
+ unit-test $(name) : $(t) syntax_common /top//boost_unit_test_framework /top//boost_system ;
+}
diff --git a/phrase-extract/syntax-common/exception.h b/phrase-extract/syntax-common/exception.h
new file mode 100644
index 000000000..18d529fc3
--- /dev/null
+++ b/phrase-extract/syntax-common/exception.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <string>
+
+namespace MosesTraining {
+namespace Syntax {
+
+class Exception {
+ public:
+ Exception(const char *msg) : msg_(msg) {}
+ Exception(const std::string &msg) : msg_(msg) {}
+
+ const std::string &msg() const { return msg_; }
+
+ private:
+ std::string msg_;
+};
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/numbered_set.h b/phrase-extract/syntax-common/numbered_set.h
new file mode 100644
index 000000000..60933fe96
--- /dev/null
+++ b/phrase-extract/syntax-common/numbered_set.h
@@ -0,0 +1,110 @@
+#pragma once
+
+#include <limits>
+#include <sstream>
+#include <vector>
+
+#include <boost/unordered_map.hpp>
+
+#include "exception.h"
+
+namespace MosesTraining {
+namespace Syntax {
+
+// Stores a set of elements of type T, each of which is allocated an integral
+// ID of type I. IDs are contiguous starting at 0. Individual elements cannot
+// be removed once inserted (but the whole set can be cleared).
+template<typename T, typename I=size_t>
+class NumberedSet {
+ private:
+ typedef boost::unordered_map<T, I> ElementToIdMap;
+ typedef std::vector<const T *> IdToElementMap;
+
+ public:
+ typedef I IdType;
+ typedef typename IdToElementMap::const_iterator const_iterator;
+
+ NumberedSet() {}
+
+ const_iterator begin() const { return id_to_element_.begin(); }
+ const_iterator end() const { return id_to_element_.end(); }
+
+ // Static value
+ static I NullId() { return std::numeric_limits<I>::max(); }
+
+ bool IsEmpty() const { return id_to_element_.empty(); }
+ size_t Size() const { return id_to_element_.size(); }
+
+ // Insert the given object and return its ID.
+ I Insert(const T &);
+
+ // Look up the given object and return its ID.
+ I Lookup(const T &) const;
+
+ // Look up the given object using an alternative key type (this is useful if,
+ // for example, T is std::string and you want to look up the object using a
+ // StringPiece).
+ template<typename CompatibleKey, typename CompatibleHash,
+ typename CompatiblePredicate>
+ I Lookup(const CompatibleKey &, const CompatibleHash &,
+ const CompatiblePredicate &) const;
+
+ // Lookup the object with the given ID.
+ const T &Lookup(I) const;
+
+ void Clear();
+
+ private:
+ ElementToIdMap element_to_id_;
+ IdToElementMap id_to_element_;
+};
+
+template<typename T, typename I>
+I NumberedSet<T, I>::Lookup(const T &s) const {
+ typename ElementToIdMap::const_iterator p = element_to_id_.find(s);
+ return (p == element_to_id_.end()) ? NullId() : p->second;
+}
+
+template<typename T, typename I>
+template<typename CompatibleKey, typename CompatibleHash,
+ typename CompatiblePredicate>
+I NumberedSet<T, I>::Lookup(const CompatibleKey &key,
+ const CompatibleHash &hash,
+ const CompatiblePredicate &pred) const {
+ typename ElementToIdMap::const_iterator p =
+ element_to_id_.find(key, hash, pred);
+ return (p == element_to_id_.end()) ? NullId() : p->second;
+}
+
+template<typename T, typename I>
+const T &NumberedSet<T, I>::Lookup(I id) const {
+ // FIXME Need to check id is > 0 iff I is a signed type.
+ //if (id < 0 || id >= id_to_element_.size()) {
+ if (id >= id_to_element_.size()) {
+ std::ostringstream msg;
+ msg << "Value not found: " << id;
+ throw Exception(msg.str());
+ }
+ return *(id_to_element_[id]);
+}
+
+template<typename T, typename I>
+I NumberedSet<T, I>::Insert(const T &x) {
+ std::pair<T, I> value(x, id_to_element_.size());
+ std::pair<typename ElementToIdMap::iterator, bool> result =
+ element_to_id_.insert(value);
+ if (result.second) {
+ // x is a new element.
+ id_to_element_.push_back(&result.first->first);
+ }
+ return result.first->second;
+}
+
+template<typename T, typename I>
+void NumberedSet<T, I>::Clear() {
+ element_to_id_.clear();
+ id_to_element_.clear();
+}
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/string_tree.h b/phrase-extract/syntax-common/string_tree.h
new file mode 100644
index 000000000..c1676e72c
--- /dev/null
+++ b/phrase-extract/syntax-common/string_tree.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <string>
+
+#include "tree.h"
+
+namespace MosesTraining {
+namespace Syntax {
+
+typedef Tree<std::string> StringTree;
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/tree-inl.h b/phrase-extract/syntax-common/tree-inl.h
new file mode 100644
index 000000000..2ba55df1a
--- /dev/null
+++ b/phrase-extract/syntax-common/tree-inl.h
@@ -0,0 +1,115 @@
+#pragma once
+
+#include <stack>
+#include <vector>
+
+namespace MosesTraining {
+namespace Syntax {
+
+template<typename T>
+Tree<T>::~Tree() {
+ for (typename std::vector<Tree *>::iterator p = children_.begin();
+ p != children_.end(); ++p) {
+ delete *p;
+ }
+}
+
+template<typename T>
+void Tree<T>::SetParents() {
+ for (typename std::vector<Tree *>::iterator p = children_.begin();
+ p != children_.end(); ++p) {
+ (*p)->parent() = this;
+ (*p)->SetParents();
+ }
+}
+
+template<typename T>
+std::size_t Tree<T>::Depth() const {
+ std::size_t depth = 0;
+ Tree *ancestor = parent_;
+ while (ancestor != 0) {
+ ++depth;
+ ancestor = ancestor->parent_;
+ }
+ return depth;
+}
+
+template<typename T>
+class Tree<T>::PreOrderIterator {
+ public:
+ PreOrderIterator();
+ PreOrderIterator(Tree<T> &);
+
+ Tree<T> &operator*() { return *node_; }
+ Tree<T> *operator->() { return node_; }
+
+ PreOrderIterator &operator++();
+ PreOrderIterator operator++(int);
+
+ bool operator==(const Tree<T>::PreOrderIterator &);
+ bool operator!=(const Tree<T>::PreOrderIterator &);
+
+ private:
+ // Pointer to the current node.
+ Tree<T> *node_;
+
+ // Stack of indices defining the position of node_ within the child vectors
+ // of its ancestors.
+ std::stack<std::size_t> index_stack_;
+};
+
+template<typename T>
+Tree<T>::PreOrderIterator::PreOrderIterator()
+ : node_(0) {
+}
+
+template<typename T>
+Tree<T>::PreOrderIterator::PreOrderIterator(Tree<T> &t)
+ : node_(&t) {
+}
+
+template<typename T>
+typename Tree<T>::PreOrderIterator &Tree<T>::PreOrderIterator::operator++() {
+ // If the current node has children then visit the left-most child next.
+ if (!node_->children().empty()) {
+ index_stack_.push(0);
+ node_ = node_->children()[0];
+ return *this;
+ }
+ // Otherwise, try node's ancestors until either a node is found with a
+ // sibling to the right or we reach the root (in which case the traversal
+ // is complete).
+ Tree<T> *ancestor = node_->parent_;
+ while (ancestor) {
+ std::size_t index = index_stack_.top();
+ index_stack_.pop();
+ if (index+1 < ancestor->children_.size()) {
+ index_stack_.push(index+1);
+ node_ = ancestor->children()[index+1];
+ return *this;
+ }
+ ancestor = ancestor->parent_;
+ }
+ node_ = 0;
+ return *this;
+}
+
+template<typename T>
+typename Tree<T>::PreOrderIterator Tree<T>::PreOrderIterator::operator++(int) {
+ PreOrderIterator tmp(*this);
+ ++*this;
+ return tmp;
+}
+
+template<typename T>
+bool Tree<T>::PreOrderIterator::operator==(const PreOrderIterator &rhs) {
+ return node_ == rhs.node_;
+}
+
+template<typename T>
+bool Tree<T>::PreOrderIterator::operator!=(const PreOrderIterator &rhs) {
+ return node_ != rhs.node_;
+}
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/tree.h b/phrase-extract/syntax-common/tree.h
new file mode 100644
index 000000000..52adaa699
--- /dev/null
+++ b/phrase-extract/syntax-common/tree.h
@@ -0,0 +1,91 @@
+#pragma once
+
+#include <vector>
+
+namespace MosesTraining {
+namespace Syntax {
+
+// A basic k-ary tree with node values of type T. Each node has a vector of
+// pointers to its children and a pointer to its parent (or 0 for the root).
+//
+// See the unit tests in tree_test.cc for examples of usage.
+//
+// Note: a Tree owns its children: it will delete them on destruction.
+//
+// Note: it's the user's responsibility to ensure that parent and child pointers
+// are correctly set and maintained. A convenient(-ish) way of building a
+// properly-connected tree is to add all the nodes as children of their
+// respective parents (using the children() accessor) and then call
+// SetParents() on the root at the end.
+//
+template<typename T>
+class Tree {
+ public:
+ // Constructors
+ Tree()
+ : value_()
+ , children_()
+ , parent_(0) {}
+
+ Tree(const T &value)
+ : value_(value)
+ , children_()
+ , parent_(0) {}
+
+ // Destructor (deletes children)
+ ~Tree();
+
+ // Access tree's value.
+ const T &value() const { return value_; }
+ T &value() { return value_; }
+
+ // Access tree's parent.
+ const Tree *parent() const { return parent_; }
+ Tree *&parent() { return parent_; }
+
+ // Access tree's children.
+ const std::vector<Tree *> &children() const { return children_; }
+ std::vector<Tree *> &children() { return children_; }
+
+ // Set the parent values for this subtree (excluding this node).
+ void SetParents();
+
+ // Leaf predicate.
+ bool IsLeaf() const { return children_.empty(); }
+
+ // Calculate the depth of this node within the tree (where the root has a
+ // depth of 0, root's children have a depth 1, etc).
+ std::size_t Depth() const;
+
+ // Iterators
+ //
+ // All iterators are forward iterators. Example use:
+ //
+ // Tree<int> &root = GetMeATree();
+ // for (Tree<int>::PreOrderIterator p(root);
+ // p != Tree<int>::PreOrderIterator(); ++p) {
+ // std::cout << p->value() << " ";
+ // }
+
+ // Pre-order iterators.
+ class PreOrderIterator;
+ // class ConstPreOrderIterator; TODO
+
+ // Post-order iterators.
+ // class PostOrderIterator; TODO
+ // class ConstPostOrderIterator; TODO
+
+ // Leaf iterators (left-to-right).
+ // class LeafIterator; TODO
+ // class ConstLeafIterator; TODO
+
+ private:
+ T value_;
+ std::vector<Tree *> children_;
+ Tree *parent_;
+};
+
+} // namespace Syntax
+} // namespace MosesTraining
+
+#include "tree-inl.h"
diff --git a/phrase-extract/syntax-common/tree_fragment_tokenizer.cc b/phrase-extract/syntax-common/tree_fragment_tokenizer.cc
new file mode 100644
index 000000000..ab3db3a84
--- /dev/null
+++ b/phrase-extract/syntax-common/tree_fragment_tokenizer.cc
@@ -0,0 +1,90 @@
+#include "tree_fragment_tokenizer.h"
+
+#include <cctype>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+
+TreeFragmentToken::TreeFragmentToken(TreeFragmentTokenType t,
+ StringPiece v, std::size_t p)
+ : type(t)
+ , value(v)
+ , pos(p) {
+}
+
+TreeFragmentTokenizer::TreeFragmentTokenizer()
+ : value_(TreeFragmentToken_EOS, "", -1) {
+}
+
+TreeFragmentTokenizer::TreeFragmentTokenizer(const StringPiece &s)
+ : str_(s)
+ , value_(TreeFragmentToken_EOS, "", -1)
+ , iter_(s.begin())
+ , end_(s.end())
+ , pos_(0) {
+ ++(*this);
+}
+
+TreeFragmentTokenizer &TreeFragmentTokenizer::operator++() {
+ while (iter_ != end_ && (*iter_ == ' ' || *iter_ == '\t')) {
+ ++iter_;
+ ++pos_;
+ }
+
+ if (iter_ == end_) {
+ value_ = TreeFragmentToken(TreeFragmentToken_EOS, "", pos_);
+ return *this;
+ }
+
+ if (*iter_ == '[') {
+ value_ = TreeFragmentToken(TreeFragmentToken_LSB, "[", pos_);
+ ++iter_;
+ ++pos_;
+ } else if (*iter_ == ']') {
+ value_ = TreeFragmentToken(TreeFragmentToken_RSB, "]", pos_);
+ ++iter_;
+ ++pos_;
+ } else {
+ std::size_t start = pos_;
+ while (true) {
+ ++iter_;
+ ++pos_;
+ if (iter_ == end_ || *iter_ == ' ' || *iter_ == '\t') {
+ break;
+ }
+ if (*iter_ == '[' || *iter_ == ']') {
+ break;
+ }
+ }
+ StringPiece word = str_.substr(start, pos_-start);
+ value_ = TreeFragmentToken(TreeFragmentToken_WORD, word, start);
+ }
+
+ return *this;
+}
+
+TreeFragmentTokenizer TreeFragmentTokenizer::operator++(int) {
+ TreeFragmentTokenizer tmp(*this);
+ ++*this;
+ return tmp;
+}
+
+bool operator==(const TreeFragmentTokenizer &lhs,
+ const TreeFragmentTokenizer &rhs) {
+ if (lhs.value_.type == TreeFragmentToken_EOS ||
+ rhs.value_.type == TreeFragmentToken_EOS) {
+ return lhs.value_.type == TreeFragmentToken_EOS &&
+ rhs.value_.type == TreeFragmentToken_EOS;
+ }
+ return lhs.iter_ == rhs.iter_;
+}
+
+bool operator!=(const TreeFragmentTokenizer &lhs,
+ const TreeFragmentTokenizer &rhs) {
+ return !(lhs == rhs);
+}
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/tree_fragment_tokenizer.h b/phrase-extract/syntax-common/tree_fragment_tokenizer.h
new file mode 100644
index 000000000..ca8741a52
--- /dev/null
+++ b/phrase-extract/syntax-common/tree_fragment_tokenizer.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include "util/string_piece.hh"
+
+namespace MosesTraining {
+namespace Syntax {
+
+enum TreeFragmentTokenType {
+ TreeFragmentToken_EOS,
+ TreeFragmentToken_LSB,
+ TreeFragmentToken_RSB,
+ TreeFragmentToken_WORD
+};
+
+struct TreeFragmentToken {
+ public:
+ TreeFragmentToken(TreeFragmentTokenType, StringPiece, std::size_t);
+ TreeFragmentTokenType type;
+ StringPiece value;
+ std::size_t pos;
+};
+
+// Tokenizes tree fragment strings in Moses format.
+//
+// For example, the string "[S [NP [NN weasels]] [VP]]" is tokenized to the
+// sequence:
+//
+// 1 LSB "["
+// 2 WORD "S"
+// 3 LSB "["
+// 4 WORD "NP"
+// 5 LSB "["
+// 6 WORD "NN"
+//   7  WORD  "weasels"
+// 8 RSB "]"
+// 9 RSB "]"
+// 10 LSB "["
+// 11 WORD "VP"
+// 12 RSB "]"
+// 13 RSB "]"
+// 14 EOS undefined
+//
+class TreeFragmentTokenizer {
+ public:
+ TreeFragmentTokenizer();
+ TreeFragmentTokenizer(const StringPiece &);
+
+ const TreeFragmentToken &operator*() const { return value_; }
+ const TreeFragmentToken *operator->() const { return &value_; }
+
+ TreeFragmentTokenizer &operator++();
+ TreeFragmentTokenizer operator++(int);
+
+ friend bool operator==(const TreeFragmentTokenizer &,
+ const TreeFragmentTokenizer &);
+
+ friend bool operator!=(const TreeFragmentTokenizer &,
+ const TreeFragmentTokenizer &);
+
+ private:
+ StringPiece str_;
+ TreeFragmentToken value_;
+ StringPiece::const_iterator iter_;
+ StringPiece::const_iterator end_;
+ std::size_t pos_;
+};
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/tree_fragment_tokenizer_test.cc b/phrase-extract/syntax-common/tree_fragment_tokenizer_test.cc
new file mode 100644
index 000000000..cd09c6911
--- /dev/null
+++ b/phrase-extract/syntax-common/tree_fragment_tokenizer_test.cc
@@ -0,0 +1,74 @@
+#include "tree_fragment_tokenizer.h"
+
+#define BOOST_TEST_MODULE TreeTest
+#include <boost/test/unit_test.hpp>
+
+#include <boost/scoped_ptr.hpp>
+
+namespace MosesTraining {
+namespace Syntax {
+namespace {
+
+BOOST_AUTO_TEST_CASE(tokenize_empty) {
+ const std::string fragment = "";
+ std::vector<TreeFragmentToken> tokens;
+ for (TreeFragmentTokenizer p(fragment); p != TreeFragmentTokenizer(); ++p) {
+ tokens.push_back(*p);
+ }
+ BOOST_REQUIRE(tokens.empty());
+}
+
+BOOST_AUTO_TEST_CASE(tokenize_space) {
+ const std::string fragment = " [ weasel weasel ] [] ] wea[sel";
+ std::vector<TreeFragmentToken> tokens;
+ for (TreeFragmentTokenizer p(fragment); p != TreeFragmentTokenizer(); ++p) {
+ tokens.push_back(*p);
+ }
+ BOOST_REQUIRE(tokens.size() == 10);
+ BOOST_REQUIRE(tokens[0].type == TreeFragmentToken_LSB);
+ BOOST_REQUIRE(tokens[0].value == "[");
+ BOOST_REQUIRE(tokens[1].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[1].value == "weasel");
+ BOOST_REQUIRE(tokens[2].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[2].value == "weasel");
+ BOOST_REQUIRE(tokens[3].type == TreeFragmentToken_RSB);
+ BOOST_REQUIRE(tokens[3].value == "]");
+ BOOST_REQUIRE(tokens[4].type == TreeFragmentToken_LSB);
+ BOOST_REQUIRE(tokens[4].value == "[");
+ BOOST_REQUIRE(tokens[5].type == TreeFragmentToken_RSB);
+ BOOST_REQUIRE(tokens[5].value == "]");
+ BOOST_REQUIRE(tokens[6].type == TreeFragmentToken_RSB);
+ BOOST_REQUIRE(tokens[6].value == "]");
+ BOOST_REQUIRE(tokens[7].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[7].value == "wea");
+ BOOST_REQUIRE(tokens[8].type == TreeFragmentToken_LSB);
+ BOOST_REQUIRE(tokens[8].value == "[");
+ BOOST_REQUIRE(tokens[9].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[9].value == "sel");
+}
+
+BOOST_AUTO_TEST_CASE(tokenize_fragment) {
+ const std::string fragment = "[S [NP [NN weasels]] [VP]]";
+ std::vector<TreeFragmentToken> tokens;
+ for (TreeFragmentTokenizer p(fragment); p != TreeFragmentTokenizer(); ++p) {
+ tokens.push_back(*p);
+ }
+ BOOST_REQUIRE(tokens.size() == 13);
+ BOOST_REQUIRE(tokens[0].type == TreeFragmentToken_LSB);
+ BOOST_REQUIRE(tokens[1].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[2].type == TreeFragmentToken_LSB);
+ BOOST_REQUIRE(tokens[3].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[4].type == TreeFragmentToken_LSB);
+ BOOST_REQUIRE(tokens[5].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[6].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[7].type == TreeFragmentToken_RSB);
+ BOOST_REQUIRE(tokens[8].type == TreeFragmentToken_RSB);
+ BOOST_REQUIRE(tokens[9].type == TreeFragmentToken_LSB);
+ BOOST_REQUIRE(tokens[10].type == TreeFragmentToken_WORD);
+ BOOST_REQUIRE(tokens[11].type == TreeFragmentToken_RSB);
+ BOOST_REQUIRE(tokens[12].type == TreeFragmentToken_RSB);
+}
+
+} // namespace
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/tree_test.cc b/phrase-extract/syntax-common/tree_test.cc
new file mode 100644
index 000000000..0a54ad3f1
--- /dev/null
+++ b/phrase-extract/syntax-common/tree_test.cc
@@ -0,0 +1,66 @@
+#include "tree.h"
+
+#define BOOST_TEST_MODULE TreeTest
+#include <boost/test/unit_test.hpp>
+
+#include <boost/scoped_ptr.hpp>
+
+namespace MosesTraining {
+namespace Syntax {
+namespace {
+
+// Test Tree<>::PreOrderIterator with a trivial, single-node tree.
+BOOST_AUTO_TEST_CASE(pre_order_1) {
+ boost::scoped_ptr<Tree<int> > root(new Tree<int>(123));
+ Tree<int>::PreOrderIterator p(*root);
+ BOOST_REQUIRE(p != Tree<int>::PreOrderIterator());
+ BOOST_REQUIRE(p->value() == 123);
+ ++p;
+ BOOST_REQUIRE(p == Tree<int>::PreOrderIterator());
+}
+
+// Test Tree<>::PreOrderIterator on this tree: (1 (2 3) (4) (5 6 (7 8)))
+BOOST_AUTO_TEST_CASE(pre_order_2) {
+ boost::scoped_ptr<Tree<int> > root(new Tree<int>(1));
+ root->children().push_back(new Tree<int>(2));
+ root->children()[0]->children().push_back(new Tree<int>(3));
+ root->children().push_back(new Tree<int>(4));
+ root->children().push_back(new Tree<int>(5));
+ root->children()[2]->children().push_back(new Tree<int>(6));
+ root->children()[2]->children().push_back(new Tree<int>(7));
+ root->children()[2]->children()[1]->children().push_back(new Tree<int>(8));
+ root->SetParents();
+
+ Tree<int>::PreOrderIterator p(*root);
+ Tree<int>::PreOrderIterator end;
+
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 1);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 2);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 3);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 4);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 5);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 6);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 7);
+ ++p;
+ BOOST_REQUIRE(p != end);
+ BOOST_REQUIRE(p->value() == 8);
+ ++p;
+ BOOST_REQUIRE(p == end);
+}
+
+} // namespace
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc
new file mode 100644
index 000000000..c4363a3e2
--- /dev/null
+++ b/phrase-extract/syntax-common/xml_tree_parser.cc
@@ -0,0 +1,59 @@
+#include "xml_tree_parser.h"
+
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
+#include <cassert>
+#include <vector>
+
+namespace MosesTraining {
+namespace Syntax {
+
+StringTree *XmlTreeParser::Parse(const std::string &line) {
+ line_ = line;
+ tree_.Clear();
+ try {
+ if (!ProcessAndStripXMLTags(line_, tree_, label_set_, top_label_set_,
+ false)) {
+ throw Exception("");
+ }
+ } catch (const XmlException &e) {
+ throw Exception(e.getMsg());
+ }
+ tree_.ConnectNodes();
+ SyntaxNode *root = tree_.GetTop();
+ assert(root);
+ words_ = tokenize(line_.c_str());
+ return ConvertTree(*root, words_);
+}
+
+// Converts a SyntaxNode tree to a StringTree.
+StringTree *XmlTreeParser::ConvertTree(const SyntaxNode &tree,
+ const std::vector<std::string> &words) {
+ StringTree *root = new StringTree(tree.GetLabel());
+ const std::vector<SyntaxNode*> &children = tree.GetChildren();
+ if (children.empty()) {
+ if (tree.GetStart() != tree.GetEnd()) {
+ std::ostringstream msg;
+ msg << "leaf node covers multiple words (" << tree.GetStart()
+ << "-" << tree.GetEnd() << "): this is currently unsupported";
+ throw Exception(msg.str());
+ }
+ StringTree *leaf = new StringTree(words[tree.GetStart()]);
+ leaf->parent() = root;
+ root->children().push_back(leaf);
+ } else {
+ for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
+ p != children.end(); ++p) {
+ assert(*p);
+ StringTree *child = ConvertTree(**p, words);
+ child->parent() = root;
+ root->children().push_back(child);
+ }
+ }
+ return root;
+}
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h
new file mode 100644
index 000000000..a5563f63a
--- /dev/null
+++ b/phrase-extract/syntax-common/xml_tree_parser.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "SyntaxTree.h"
+
+#include "exception.h"
+#include "string_tree.h"
+
+namespace MosesTraining {
+namespace Syntax {
+
+// Parses a string in Moses' XML parse tree format and returns a StringTree
+// object. This is a wrapper around the ProcessAndStripXMLTags function.
+class XmlTreeParser {
+ public:
+ StringTree *Parse(const std::string &);
+
+ private:
+ static StringTree *ConvertTree(const MosesTraining::SyntaxNode &,
+ const std::vector<std::string> &);
+
+ std::set<std::string> label_set_;
+ std::map<std::string, int> top_label_set_;
+ std::string line_;
+ MosesTraining::SyntaxTree tree_;
+ std::vector<std::string> words_;
+};
+
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/tables-core.h b/phrase-extract/tables-core.h
index e239e5900..9662ced2a 100644
--- a/phrase-extract/tables-core.h
+++ b/phrase-extract/tables-core.h
@@ -27,7 +27,7 @@ public:
std::vector< WORD > vocab;
WORD_ID storeIfNew( const WORD& );
WORD_ID getWordID( const WORD& );
- inline WORD &getWord( WORD_ID id ) {
+ inline WORD &getWord( const WORD_ID id ) {
return vocab[ id ];
}
};
diff --git a/regression-testing/Jamfile b/regression-testing/Jamfile
index 78349ea2c..b2ba7cce1 100644
--- a/regression-testing/Jamfile
+++ b/regression-testing/Jamfile
@@ -25,7 +25,7 @@ if $(with-regtest) {
$(TOP)/regression-testing/run-single-test.perl --decoder=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
}
reg_test phrase : [ glob $(test-dir)/phrase.* ] : ../moses-cmd//moses : @reg_test_decode ;
- reg_test chart : [ glob $(test-dir)/chart.* ] : ../moses-chart-cmd//moses_chart : @reg_test_decode ;
+ reg_test chart : [ glob $(test-dir)/chart.* ] : ../moses-cmd//moses : @reg_test_decode ;
actions reg_test_score {
$(TOP)/regression-testing/run-test-scorer.perl --scorer=$(>) --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<)
diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl
index 2d2427bc5..ae5a386fa 100755
--- a/scripts/OSM/OSM-Train.perl
+++ b/scripts/OSM/OSM-Train.perl
@@ -190,3 +190,22 @@ sub open_or_zcat {
open($hdl,$read) or die "Can't read $fn ($read)";
return $hdl;
}
+
+sub safesystem {
+ print STDERR "Executing: @_\n";
+ system(@_);
+ if ($? == -1) {
+ print STDERR "ERROR: Failed to execute: @_\n $!\n";
+ exit(1);
+ }
+ elsif ($? & 127) {
+ printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
+ ($? & 127), ($? & 128) ? 'with' : 'without';
+ exit(1);
+ }
+ else {
+ my $exitcode = $? >> 8;
+ print STDERR "Exit code: $exitcode\n" if $exitcode;
+ return ! $exitcode;
+ }
+} \ No newline at end of file
diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl
index 8aca3460d..69fd8bf46 100755
--- a/scripts/Transliteration/post-decoding-transliteration.pl
+++ b/scripts/Transliteration/post-decoding-transliteration.pl
@@ -21,12 +21,12 @@ die("ERROR: wrong syntax when invoking postDecodingTransliteration.perl")
'transliteration-model-dir=s' => \$TRANSLIT_MODEL,
'input-extension=s' => \$INPUT_EXTENSION,
'output-extension=s' => \$OUTPUT_EXTENSION,
- 'decoder=s' => \$DECODER,
+ 'decoder=s' => \$DECODER,
'oov-file=s' => \$OOV_FILE,
'input-file=s' => \$INPUT_FILE,
'output-file=s' => \$OUTPUT_FILE,
'verbose' => \$VERBOSE,
- 'language-model=s' => \$LM_FILE);
+ 'language-model=s' => \$LM_FILE);
# check if the files are in place
die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --oov-file, --output-file --input-extension, --output-extension, and --language-model")
@@ -38,6 +38,11 @@ die("ERROR: you need to define --moses-src-dir --external-bin-dir, --translitera
defined($INPUT_FILE)&&
defined($EXTERNAL_BIN_DIR)&&
defined($LM_FILE));
+if (! -e $LM_FILE) {
+ my $LM_FILE_WORD = `ls $LM_FILE*word*`;
+ chop($LM_FILE_WORD);
+ $LM_FILE = $LM_FILE_WORD if $LM_FILE_WORD ne "";
+}
die("ERROR: could not find Language Model '$LM_FILE'")
unless -e $LM_FILE;
die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'")
diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl
index 355232222..7739e2a2b 100755
--- a/scripts/Transliteration/train-transliteration-module.pl
+++ b/scripts/Transliteration/train-transliteration-module.pl
@@ -13,7 +13,7 @@ print STDERR "Training Transliteration Module - Start\n".`date`;
my $ORDER = 5;
my $OUT_DIR = "/tmp/Transliteration-Model.$$";
my $___FACTOR_DELIMITER = "|";
-my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$EXTERNAL_BIN_DIR,$INPUT_EXTENSION, $OUTPUT_EXTENSION, $SOURCE_SYNTAX, $TARGET_SYNTAX);
+my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$EXTERNAL_BIN_DIR,$INPUT_EXTENSION, $OUTPUT_EXTENSION, $SOURCE_SYNTAX, $TARGET_SYNTAX,$DECODER);
# utilities
my $ZCAT = "gzip -cd";
@@ -31,8 +31,9 @@ die("ERROR: wrong syntax when invoking train-transliteration-module.perl")
'factor=s' => \$FACTOR,
'srilm-dir=s' => \$SRILM_DIR,
'out-dir=s' => \$OUT_DIR,
- 'source-syntax' => \$SOURCE_SYNTAX,
- 'target-syntax' => \$TARGET_SYNTAX);
+ 'decoder=s' => \$DECODER,
+ 'source-syntax' => \$SOURCE_SYNTAX,
+ 'target-syntax' => \$TARGET_SYNTAX);
# check if the files are in place
die("ERROR: you need to define --corpus-e, --corpus-f, --alignment, --srilm-dir, --moses-src-dir --external-bin-dir, --input-extension and --output-extension")
@@ -48,8 +49,9 @@ die("ERROR: could not find input corpus file '$CORPUS_F'")
unless -e $CORPUS_F;
die("ERROR: could not find output corpus file '$CORPUS_E'")
unless -e $CORPUS_E;
-die("ERROR: could not find algnment file '$ALIGNMENT'")
+die("ERROR: could not find alignment file '$ALIGNMENT'")
unless -e $ALIGNMENT;
+$DECODER = "$MOSES_SRC_DIR/bin/moses" unless defined($DECODER);
`mkdir $OUT_DIR`;
@@ -184,7 +186,7 @@ sub train_transliteration_module{
`$MOSES_SRC_DIR/scripts/ems/support/substitute-filtered-tables.perl $OUT_DIR/tuning/filtered/moses.ini < $OUT_DIR/model/moses.ini > $OUT_DIR/tuning/moses.filtered.ini`;
- `$MOSES_SRC_DIR/scripts/training/mert-moses.pl $OUT_DIR/tuning/input $OUT_DIR/tuning/reference $MOSES_SRC_DIR/bin/moses $OUT_DIR/tuning/moses.filtered.ini --nbest 100 --working-dir $OUT_DIR/tuning/tmp --decoder-flags "-threads 16 -drop-unknown -v 0 -distortion-limit 0" --rootdir $MOSES_SRC_DIR/scripts -mertdir $MOSES_SRC_DIR/mert -threads=16 --no-filter-phrase-table`;
+ `$MOSES_SRC_DIR/scripts/training/mert-moses.pl $OUT_DIR/tuning/input $OUT_DIR/tuning/reference $DECODER $OUT_DIR/tuning/moses.filtered.ini --nbest 100 --working-dir $OUT_DIR/tuning/tmp --decoder-flags "-threads 16 -drop-unknown -v 0 -distortion-limit 0" --rootdir $MOSES_SRC_DIR/scripts -mertdir $MOSES_SRC_DIR/mert -threads=16 --no-filter-phrase-table`;
`cp $OUT_DIR/tuning/tmp/moses.ini $OUT_DIR/tuning/moses.ini`;
diff --git a/scripts/analysis/oov.pl b/scripts/analysis/oov.pl
index 85ddbb0fe..15261c410 100755
--- a/scripts/analysis/oov.pl
+++ b/scripts/analysis/oov.pl
@@ -5,6 +5,8 @@
use strict;
use warnings;
+use Digest::MD5 qw(md5);
+use Encode qw(encode_utf8);
use Getopt::Long;
binmode(STDIN, ":utf8");
@@ -14,9 +16,11 @@ binmode(STDERR, ":utf8");
my $verbose = 0;
my $n = 1;
my $srcfile = undef;
+my $md5 = 0;
GetOptions(
"n=i" => \$n, # the n-grams to search for (default: unigrams)
- "verbose" => \$verbose, # emit the list of oov words
+ "verbose!" => \$verbose, # emit the list of oov words
+  "md5!" => \$md5, # hash each ngram with md5 to save memory
"src=s" => \$srcfile, # use this source file
) or exit 1;
@@ -25,6 +29,8 @@ if (!defined $testf) {
print STDERR "usage: $0 test-corpus < training-corpus
Options:
--n=1 ... use phrases of n words as the unit
+ set --n=0 to compare *whole sentences* (forces md5 hashing on)
+ --md5 ... hash each ngram using md5, saves memory for longer n-grams
--verbose ... emit OOV phrases at the end
--src=test-src ... a word in the test-corpus not deemed OOV if present in the
corresponding source sentence in test-src.
@@ -39,6 +45,8 @@ Synopsis:
exit 1;
}
+my $ngr_or_sent = $n > 0 ? "$n-grams" : "sentences";
+
# load source file to accept ngrams from source
my $source_confirms = undef;
my $srcfilelen = undef;
@@ -51,7 +59,7 @@ if (defined $srcfile) {
chomp;
s/^\s+//;
s/\s+$//;
- my $ngrams = ngrams($n, [ split /\s+/, $_ ]);
+ my $ngrams = ngrams($n, $_);
foreach my $ngr (keys %$ngrams) {
$source_confirms->[$nr]->{$ngr} += $ngrams->{$ngr};
$srctokens += $ngrams->{$ngr};
@@ -59,7 +67,7 @@ if (defined $srcfile) {
}
close $fh;
print "Source set sents\t$nr\n";
- print "Source set running $n-grams\t$srctokens\n";
+ print "Source set running $ngr_or_sent\t$srctokens\n" if $n>0;
$srcfilelen = $nr;
}
@@ -73,7 +81,7 @@ while (<$fh>) {
chomp;
s/^\s+//;
s/\s+$//;
- my $ngrams = ngrams($n, [ split /\s+/, $_ ]);
+ my $ngrams = ngrams($n, $_);
foreach my $ngr (keys %$ngrams) {
$needed{$ngr} += $ngrams->{$ngr}
unless $source_confirms->[$nr]->{$ngr};
@@ -85,9 +93,9 @@ close $fh;
my $testtypesneeded = scalar(keys(%needed));
my $testtypes = scalar(keys(%testtypes));
print "Test set sents\t$nr\n";
-print "Test set running $n-grams\t$testtokens\n";
-print "Test set unique $n-grams needed\t$testtypesneeded\n";
-print "Test set unique $n-grams\t$testtypes\n";
+print "Test set running $n-grams\t$testtokens\n" if $n>0;
+print "Test set unique $ngr_or_sent needed\t$testtypesneeded\n";
+print "Test set unique $ngr_or_sent\t$testtypes\n";
die "Mismatching sent count: $srcfile and $testf ($srcfilelen vs. $nr)"
if defined $srcfile && $srcfilelen != $nr;
@@ -102,7 +110,7 @@ while (<>) {
chomp;
s/^\s+//;
s/\s+$//;
- my $ngrams = ngrams($n, [ split /\s+/, $_ ]);
+ my $ngrams = ngrams($n, $_); # [ split /\s+/, $_ ]);
foreach my $ngr (keys %$ngrams) {
$seen{$ngr} = 1 if $ngrams->{$ngr};
$traintokens += $ngrams->{$ngr};
@@ -114,8 +122,8 @@ foreach my $ngr (keys %needed) {
print STDERR "Done.\n";
my $traintypes = scalar(keys(%seen));
print "Training set sents\t$nr\n";
-print "Training set running $n-grams\t$traintokens\n";
-print "Training set unique $n-grams\t$traintypes\n";
+print "Training set running $n-grams\t$traintokens\n" if $n>0;
+print "Training set unique $ngr_or_sent\t$traintypes\n";
my $oovtypes = scalar(keys(%needed));
@@ -123,8 +131,8 @@ my $oovtokens = 0;
foreach my $v (values %needed) {
$oovtokens += $v;
}
-printf "OOV $n-gram types\t%i\t%.1f %%\n", $oovtypes, $oovtypes/$testtypes*100;
-printf "OOV $n-gram tokens\t%i\t%.1f %%\n", $oovtokens, $oovtokens/$testtokens*100;
+printf "OOV $ngr_or_sent types\t%i\t%.1f %%\n", $oovtypes, $oovtypes/$testtypes*100;
+printf "OOV $ngr_or_sent tokens\t%i\t%.1f %%\n", $oovtokens, $oovtokens/$testtokens*100;
if ($verbose) {
foreach my $ngr (sort {$needed{$b} <=> $needed{$a}} keys %needed) {
@@ -159,17 +167,26 @@ sub my_open {
sub ngrams {
my $n = shift;
- my @words = @{shift()};
- my $out;
- if ($n == 1) {
- foreach my $w (@words) {
- $out->{$w}++;
- }
+ my $sent = shift;
+
+ if ($n == 0) {
+ return { md5(encode_utf8($sent)) => 1 };
} else {
- while ($#words >= $n-1) {
- $out->{join(" ", @words[0..$n-1])}++;
- shift @words;
+ my @words = split /\s+/, $sent;
+ my $out;
+ if ($n == 1) {
+ foreach my $w (@words) {
+        my $usew = $md5 ? md5(encode_utf8($w)) : $w;
+        $out->{$usew}++;
+ }
+ } else {
+ while ($#words >= $n-1) {
+ my $ngr = join(" ", @words[0..$n-1]);
+ my $usengr = $md5 ? md5(encode_utf8($ngr)) : $ngr;
+        $out->{$usengr}++;
+ shift @words;
+ }
}
+ return $out;
}
- return $out;
}
diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic
index 1db8154f5..8421a8fa1 100644
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@@ -137,7 +137,7 @@ raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
-settings = "--prune '0 0 1' -T $working-dir/lm/tmp -S 50%"
+settings = "--prune '0 0 1' -T $working-dir/lm -S 20%"
# srilm
#lm-training = $srilm-dir/ngram-count
@@ -299,6 +299,7 @@ script = $moses-script-dir/training/train-model.perl
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
+# * "-parallel" for parallel execution of mkcls and giza
#
#training-options = ""
@@ -371,6 +372,11 @@ alignment-symmetrization-method = grow-diag-final-and
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
+### build memory mapped suffix array phrase table
+# (binarizing the reordering table is a good idea, since filtering makes little sense)
+#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
+#binarize-all = $moses-script-dir/training/binarize-model.perl
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor
@@ -383,11 +389,17 @@ alignment-symmetrization-method = grow-diag-final-and
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#
+# if OSM training should be skipped, point to OSM Model
+#osm-model =
-### if OSM training should be skipped,
-# point to OSM Model
+### unsupervised transliteration module
+# Durrani, Sajjad, Hoang and Koehn (EACL, 2014).
+# "Integrating an Unsupervised Transliteration Model
+# into Statistical Machine Translation."
#
-# osm-model =
+#transliteration-module = "yes"
+#post-decoding-transliteration = "yes"
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored
index c3a6b2a85..9aff587ff 100644
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@@ -137,7 +137,7 @@ raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
-settings = "--prune '0 0 1' -T $working-dir/lm/tmp -S 50%"
+settings = "--prune '0 0 1' -T $working-dir/lm -S 20%"
# srilm
#lm-training = $srilm-dir/ngram-count
@@ -319,6 +319,7 @@ script = $moses-script-dir/training/train-model.perl
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
+# * "-parallel" for parallel execution of mkcls and giza
#
#training-options = ""
@@ -391,6 +392,11 @@ alignment-symmetrization-method = grow-diag-final-and
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
+### build memory mapped suffix array phrase table
+# (binarizing the reordering table is a good idea, since filtering makes little sense)
+#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
+#binarize-all = $moses-script-dir/training/binarize-model.perl
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor
@@ -403,11 +409,17 @@ alignment-symmetrization-method = grow-diag-final-and
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#
+# if OSM training should be skipped, point to OSM Model
+#osm-model =
-### if OSM training should be skipped,
-# point to OSM Model
+### unsupervised transliteration module
+# Durrani, Sajjad, Hoang and Koehn (EACL, 2014).
+# "Integrating an Unsupervised Transliteration Model
+# into Statistical Machine Translation."
#
-# osm-model =
+#transliteration-module = "yes"
+#post-decoding-transliteration = "yes"
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical
index 673ad64a9..9d47aa001 100644
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@@ -137,7 +137,7 @@ raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
-settings = "--prune '0 0 1' -T $working-dir/lm/tmp -S 50%"
+settings = "--prune '0 0 1' -T $working-dir/lm -S 20%"
# srilm
#lm-training = $srilm-dir/ngram-count
@@ -299,6 +299,7 @@ script = $moses-script-dir/training/train-model.perl
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
+# * "-parallel" for parallel execution of mkcls and giza
#
#training-options = ""
@@ -371,6 +372,11 @@ alignment-symmetrization-method = grow-diag-final-and
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
+### build memory mapped suffix array phrase table
+# (binarizing the reordering table is a good idea, since filtering makes little sense)
+#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
+#binarize-all = $moses-script-dir/training/binarize-model.perl
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor
@@ -383,11 +389,17 @@ alignment-symmetrization-method = grow-diag-final-and
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#
+# if OSM training should be skipped, point to OSM Model
+#osm-model =
-### if OSM training should be skipped,
-# point to OSM Model
+### unsupervised transliteration module
+# Durrani, Sajjad, Hoang and Koehn (EACL, 2014).
+# "Integrating an Unsupervised Transliteration Model
+# into Statistical Machine Translation."
#
-# osm-model =
+#transliteration-module = "yes"
+#post-decoding-transliteration = "yes"
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax
index 7df60f990..d874e74c0 100644
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@@ -141,7 +141,7 @@ raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
-settings = "--prune '0 0 1' -T $working-dir/lm/tmp -S 50%"
+settings = "--prune '0 0 1' -T $working-dir/lm -S 20%"
# srilm
#lm-training = $srilm-dir/ngram-count
@@ -303,6 +303,7 @@ script = $moses-script-dir/training/train-model.perl
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
+# * "-parallel" for parallel execution of mkcls and giza
#
#training-options = ""
@@ -375,6 +376,11 @@ alignment-symmetrization-method = grow-diag-final-and
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
+### build memory mapped suffix array phrase table
+# (binarizing the reordering table is a good idea, since filtering makes little sense)
+#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
+#binarize-all = $moses-script-dir/training/binarize-model.perl
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor
@@ -387,11 +393,17 @@ alignment-symmetrization-method = grow-diag-final-and
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#
+# if OSM training should be skipped, point to OSM Model
+#osm-model =
-### if OSM training should be skipped,
-# point to OSM Model
+### unsupervised transliteration module
+# Durrani, Sajjad, Hoang and Koehn (EACL, 2014).
+# "Integrating an Unsupervised Transliteration Model
+# into Statistical Machine Translation."
#
-# osm-model =
+#transliteration-module = "yes"
+#post-decoding-transliteration = "yes"
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy
index bd328a18e..195a89fa5 100644
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@@ -131,7 +131,7 @@ raw-stem = $toy-data/nc-5k
### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
-settings = "--prune '0 0 1' -T $working-dir/lm/tmp -S 50%"
+settings = "--prune '0 0 1' -T $working-dir/lm -S 20%"
# srilm
#lm-training = $srilm-dir/ngram-count
@@ -283,6 +283,7 @@ script = $moses-script-dir/training/train-model.perl
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
+# * "-parallel" for parallel execution of mkcls and giza
#
#training-options = ""
@@ -355,6 +356,11 @@ alignment-symmetrization-method = grow-diag-final-and
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
+### build memory mapped suffix array phrase table
+# (binarizing the reordering table is a good idea, since filtering makes little sense)
+#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
+#binarize-all = $moses-script-dir/training/binarize-model.perl
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor
@@ -367,11 +373,17 @@ alignment-symmetrization-method = grow-diag-final-and
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = ""
+#
+# if OSM training should be skipped, point to OSM Model
+#osm-model =
-### if OSM training should be skipped,
-# point to OSM Model
+### unsupervised transliteration module
+# Durrani, Sajjad, Hoang and Koehn (EACL, 2014).
+# "Integrating an Unsupervised Transliteration Model
+# into Statistical Machine Translation."
#
-# osm-model =
+#transliteration-module = "yes"
+#post-decoding-transliteration = "yes"
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index d9715a8c9..d45dc849a 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -22,6 +22,7 @@ clean
rerun-on-change: max-sentence-length $moses-script-dir/training/clean-corpus-n.perl
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 $max-sentence-length OUT.lines-retained
error: there is a blank factor
+ error: is too long! at
parse
in: clean-stem
out: parsed-stem
@@ -104,7 +105,7 @@ tokenize
train
in: tokenized
out: recase-config
- template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT -ngram-count $lm-training
+ template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT $recasing-settings
default-name: recasing/moses.ini
tmp-name: recasing/model
ignore-unless: EVALUATION:recaser
@@ -115,11 +116,14 @@ consolidate
in: CORPUS:clean-parsed-stem
out: tokenized-stem
default-name: truecaser/corpus
+ pass-unless: trainer
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
+ error: number of lines don't match
train
in: tokenized-stem
out: truecase-model
rerun-on-change: trainer
+ pass-unless: trainer
default-name: truecaser/truecase-model
template: $trainer -model OUT.$input-extension -corpus IN.$input-extension ; $trainer -model OUT.$output-extension -corpus IN.$output-extension
@@ -150,10 +154,15 @@ tokenize
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
-factorize
+mock-parse
in: tokenized-corpus
+ out: mock-parsed-corpus
+ default-name: lm/mock-parsed
+ pass-unless: mock-output-parser-lm
+ template: $mock-output-parser-lm < IN > OUT
+factorize
+ in: mock-parsed-corpus
out: factorized-corpus
- rerun-on-change: TRAINING:output-factors
default-name: lm/factored
pass-unless: factors
parallelizable: yes
@@ -234,8 +243,14 @@ tokenize-tuning
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
-factorize-tuning
+mock-parse-tuning
in: tokenized-tuning
+ out: mock-parsed-tuning
+ default-name: lm/interpolate-tuning.mock-parsed
+ pass-unless: mock-output-parser-lm
+ template: $mock-output-parser-lm < IN > OUT
+factorize-tuning
+ in: mock-parsed-tuning
out: factorized-tuning
default-name: lm/interpolate-tuning.factored
pass-unless: TRAINING:output-factors
@@ -265,7 +280,7 @@ split-tuning
template: $output-splitter -model IN1.$output-extension < IN > OUT
interpolate
in: script split-tuning LM:lm
- rerun-on-change: srilm-dir group
+ rerun-on-change: srilm-dir group weights
out: lm
default-name: lm/interpolated-lm
randomize
@@ -551,7 +566,6 @@ extract-phrases
rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm domain-features baseline-extract lexicalized-reordering
only-existence-matters: domain-features
default-name: model/extract
- ignore-if: suffix-array
build-reordering
in: extracted-phrases
out: reordering-table
@@ -564,7 +578,14 @@ build-ttable
out: phrase-translation-table
rerun-on-change: translation-factors hierarchical-rule-set score-settings training-options script EVALUATION:report-precision-by-coverage include-word-alignment-in-rules domain-features
default-name: model/phrase-table
- ignore-if: suffix-array
+ ignore-if: suffix-array mmsapt
+ final-model: yes
+build-mmsapt
+ in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
+ out: phrase-translation-table
+ ignore-unless: mmsapt
+ default-name: model/phrase-table-mmsapt
+ template: $moses-script-dir/training/build-mmsapt.perl --alignment IN.$alignment-symmetrization-method --corpus IN1 --f $input-extension --e $output-extension --dir OUT --settings '$mmsapt'
final-model: yes
sigtest-filter-suffix-array
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
@@ -622,8 +643,8 @@ build-sparse
create-config
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
out: config
- ignore-if: use-hiero
- rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
+ ignore-if: use-hiero thot
+ rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt
default-name: model/moses.ini
error: Unknown option
final-model: yes
@@ -679,6 +700,18 @@ hiero-create-config
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors
default-name: hiero-model/hiero.ini
template: $hiero-util-dir/generate-ini.pl IN IN1 IN2 IN3 IN4 IN5 $hiero-max-phrase-length $hiero-max-nonterminals $hiero-max-phrase-span $hiero-min-gap-length $hiero-freq-rank1 $hiero-freq-rank2 < $GENERAL:hiero-template-ini > OUT
+thot-build-ttable
+ in: corpus
+ out: thot-ttable
+ default-name: model/phrase-table-thot
+ rerun-on-change: input-extension output-extension
+ template: $thot/thot_tm_train -sdir $working-dir -s IN.$input-extension -t IN.$output-extension -o OUT
+thot-create-config
+ in: thot-ttable LM:lm
+ out: config
+ ignore-unless: thot
+ default-name: model/thot.ini
+ template: $thot/thot_gen_cfg_file IN1/lm_desc IN/tm_desc > OUT
[TUNING] single
input-from-sgm
@@ -705,17 +738,32 @@ tokenize-input-devtest
pass-unless: input-tokenizer
ignore-unless: use-mira
template: $input-tokenizer < IN > OUT
-parse-input
+mock-parse-input
in: tokenized-input
+ out: mock-parsed-input
+ default-name: tuning/input.mock-parsed
+ pass-unless: mock-input-parser-devtesteval
+ template: $mock-input-parser-devtesteval < IN > OUT
+mock-parse-input-devtest
+ in: tokenized-input-devtest
+ out: mock-parsed-input-devtest
+ default-name: tuning/input.devtest.mock-parsed
+ pass-unless: mock-input-parser-devtesteval
+ ignore-unless: use-mira
+ template: $mock-input-parser-devtesteval < IN > OUT
+parse-input
+ in: mock-parsed-input
out: parsed-input
default-name: tuning/input.parsed
pass-unless: input-parser
+ pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parser < IN > OUT
parse-input-devtest
- in: tokenized-input-devtest
+ in: mock-parsed-input-devtesteval
out: parsed-input-devtest
default-name: tuning/input.devtest.parsed
pass-unless: input-parser
+ pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
ignore-unless: use-mira
template: $input-parser < IN > OUT
parse-relax-input
@@ -723,14 +771,16 @@ parse-relax-input
out: parse-relaxed-input
default-name: tuning/input.parse-relaxed
pass-unless: input-parse-relaxer
- template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
+ pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
+ template: $input-parse-relaxer < IN > OUT
parse-relax-input-devtest
in: parsed-input-devtest
out: parse-relaxed-input-devtest
default-name: tuning/input.devtest.parse-relaxed
pass-unless: input-parse-relaxer
+ pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
ignore-unless: use-mira
- template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
+ template: $input-parse-relaxer < IN > OUT
factorize-input
in: parse-relaxed-input
out: factorized-input
@@ -832,8 +882,20 @@ tokenize-reference-devtest
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
-lowercase-reference
+mock-parse-reference
in: tokenized-reference
+ out: mock-parsed-reference
+ default-name: tuning/reference.mock-parsed
+ pass-unless: mock-output-parser-references
+ template: $mock-output-parser-references < IN > OUT
+mock-parse-reference-devtest
+ in: tokenized-input-devtest
+ out: mock-parsed-reference-devtest
+ default-name: tuning/reference.devtest.mock-parsed
+ pass-unless: mock-output-parser-references
+ template: $mock-output-parser-references < IN > OUT
+lowercase-reference
+ in: mock-parsed-reference
out: truecased-reference
default-name: tuning/reference.lc
pass-unless: output-lowercaser
@@ -841,7 +903,7 @@ lowercase-reference
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
lowercase-reference-devtest
- in: tokenized-reference-devtest
+ in: mock-parsed-reference-devtest
out: truecased-reference-devtest
default-name: tuning/reference.devtest.lc
pass-unless: output-lowercaser
@@ -850,7 +912,7 @@ lowercase-reference-devtest
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
truecase-reference
- in: tokenized-reference TRUECASER:truecase-model
+ in: mock-parsed-reference TRUECASER:truecase-model
out: truecased-reference
rerun-on-change: output-truecaser
default-name: tuning/reference.tc
@@ -858,7 +920,7 @@ truecase-reference
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-truecaser -model IN1.$output-extension < IN > OUT
truecase-reference-devtest
- in: tokenized-reference-devtest TRUECASER:truecase-model
+ in: mock-parsed-reference-devtest TRUECASER:truecase-model
out: truecased-reference-devtest
rerun-on-change: output-truecaser
default-name: tuning/reference.devtest.tc
@@ -918,10 +980,17 @@ tune
final-model: yes
rerun-on-change: decoder-settings tuning-settings nbest lambda async
not-error: trans: No such file or directory
+thot-tune
+ in: TRAINING:config input reference
+ out: config-with-reused-weights
+ ignore-unless: thot
+ tmp-name: tuning/thot.tmp
+ default-name: tuning/thot.tuned.ini
+ template: mkdir -p TMP/home ; mkdir -p TMP/tdir ; mkdir -p TMP/sdir ; HOME=TMP/home $thot/thot_smt_tune -tdir TMP/tdir -sdir TMP/sdir -c IN -s IN1 -t IN2 -o OUT
apply-weights
in: TRAINING:bin-config weight-config
out: config-with-reused-weights
- ignore-if: use-hiero
+ ignore-if: use-hiero thot
default-name: tuning/moses.tuned.ini
template: $moses-script-dir/ems/support/substitute-weights.perl IN IN1 OUT
error: cannot open
@@ -959,18 +1028,26 @@ tokenize-input
default-name: evaluation/input.tok
pass-unless: input-tokenizer
template: $input-tokenizer < IN > OUT
-parse-input
+mock-parse-input
in: tokenized-input
+ out: mock-parsed-input
+ default-name: evaluation/input.mock-parsed
+ pass-unless: mock-input-parser-devtesteval
+ template: $mock-input-parser-devtesteval < IN > OUT
+parse-input
+ in: mock-parsed-input
out: parsed-input
default-name: evaluation/input.parsed
pass-unless: input-parser
+ pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parser < IN > OUT
parse-relax-input
in: parsed-input
out: parse-relaxed-input
- default-name: tuning/input.parse-relaxed
+ default-name: evaluation/input.parse-relaxed
pass-unless: input-parse-relaxer
- template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
+ pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
+ template: $input-parse-relaxer < IN > OUT
factorize-input
in: parse-relaxed-input
out: factorized-input
@@ -1020,15 +1097,15 @@ apply-filter
in: filtered-dir TRAINING:config TUNING:config-with-reused-weights
out: filtered-config
default-name: evaluation/filtered.ini
- ignore-if: TRAINING:binarize-all
+ ignore-if: TRAINING:binarize-all thot
template: $moses-script-dir/ems/support/substitute-filtered-tables-and-weights.perl IN/moses.ini IN1 IN2 OUT
decode
in: TUNING:config-with-reused-weights input filtered-config
out: system-output
default-name: evaluation/output
qsub-script: yes
- ignore-if: use-hiero
- rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade
+ ignore-if: use-hiero thot
+ rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade TRAINING:post-decoding-transliteration
error: Translation was not performed correctly
not-error: trans: No such file or directory
final-model: yes
@@ -1040,6 +1117,20 @@ hiero-decode
ignore-unless: use-hiero
template: $hiero-parallelizer -e OUT.edir -r -- $hiero-decoder -c IN < IN1 > OUT
rerun-on-change: hiero-decoder
+thot-filter
+ in: TUNING:config-with-reused-weights input
+ out: filtered-config
+ ignore-unless: thot
+ default-name: evaluation/filtered
+ tmp-name: evaluation/filtered-tmp
+ template: mkdir -p TMP/home ; mkdir -p TMP/tdir ; mkdir -p TMP/sdir ; HOME=TMP/home $thot/thot_prepare_sys_for_test -sdir TMP/sdir -tdir TMP/tdir -t IN1 -c IN/tuned_for_dev.cfg -o OUT ; cp OUT/lm/main/* OUT/lm
+thot-decode
+ in: input filtered-config
+ out: system-output
+ ignore-unless: thot
+ default-name: evaluation/output
+ template: $thot/thot_decoder -sdir $working-dir -c IN1/test_specific.cfg -t IN > OUT
+ not-error: Error in word penalty model file
remove-markup
in: system-output
out: cleaned-output
@@ -1093,8 +1184,14 @@ tokenize-reference
pass-unless: output-tokenizer
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
-lowercase-reference
+mock-parse-reference
in: tokenized-reference
+ out: mock-parsed-reference
+ default-name: evaluation/reference.mock-parsed
+ pass-unless: mock-output-parser-references
+ template: $mock-output-parser-references < IN > OUT
+lowercase-reference
+ in: mock-parsed-reference
out: reference
default-name: evaluation/reference
pass-unless: output-lowercaser
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 3f4e53f23..1e47bb6b9 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -101,7 +101,7 @@ $VERSION = $DELETE_VERSION if $DELETE_VERSION;
`mkdir -p steps/$VERSION` unless -d "steps/$VERSION";
&log_config() unless $DELETE_CRASHED || $DELETE_VERSION;
-print "running experimenal run number $VERSION\n";
+print "running experimental run number $VERSION\n";
print "\nESTABLISH WHICH STEPS NEED TO BE RUN\n";
my (%NEEDED, # mapping of input files to step numbers
@@ -281,6 +281,7 @@ sub read_meta {
$escaped_template =~ s/^IN/EMS_IN_EMS/;
$escaped_template =~ s/ IN(\d*)/ EMS_IN$1_EMS/g;
$escaped_template =~ s/ OUT/ EMS_OUT_EMS/g;
+ $escaped_template =~ s/TMP/EMS_TMP_EMS/g;
$TEMPLATE{"$module:$step"} = $escaped_template;
}
elsif ($1 eq "template-if") {
@@ -288,6 +289,7 @@ sub read_meta {
$escaped_template =~ s/^IN/EMS_IN_EMS/;
$escaped_template =~ s/ IN(\d*)/ EMS_IN$1_EMS/g;
$escaped_template =~ s/ OUT/ EMS_OUT_EMS/g;
+ $escaped_template =~ s/TMP/EMS_TMP_EMS/g;
my @IF = split(/\s+/,$escaped_template);
push @{$TEMPLATE_IF{"$module:$step"}}, \@IF;
}
@@ -450,7 +452,9 @@ sub find_steps {
}
# go through each module
- for(my $m=$#MODULE; $m>=0; $m--) {
+ while(1) {
+ my $step_count_before = scalar(@DO_STEP);
+ for(my $m=$#MODULE; $m>=0; $m--) {
my $module = $MODULE[$m];
# if module is "multiple" go through each set
@@ -475,6 +479,8 @@ sub find_steps {
&find_steps_for_module($module,"");
}
}
+ last if $step_count_before == scalar(@DO_STEP);
+ }
}
sub find_steps_for_module {
@@ -487,6 +493,7 @@ sub find_steps_for_module {
my $step = &construct_name($module,$set,$stepname);
my $defined_step = &defined_step($step); # without set
+ next if defined($STEP_LOOKUP{$step});
# FIRST, some checking...
print "\tchecking step: $step\n" if $VERBOSE;
@@ -716,9 +723,11 @@ sub delete_crashed {
for(my $i=0;$i<=$#DO_STEP;$i++) {
my $step_file = &versionize(&step_file($i),$DELETE_CRASHED);
next unless -e $step_file;
- next unless &check_if_crashed($i,$DELETE_CRASHED,"no wait");
- &delete_step($DO_STEP[$i],$DELETE_CRASHED);
- $crashed++;
+ if (! -e $step_file.".DONE" || # interrupted (machine went down)
+ &check_if_crashed($i,$DELETE_CRASHED,"no wait")) { # noted crash
+ &delete_step($DO_STEP[$i],$DELETE_CRASHED);
+ $crashed++;
+ }
}
print "run with -exec to delete steps\n" if $crashed && !$EXECUTE;
print "nothing to do\n" unless $crashed;
@@ -813,7 +822,6 @@ sub delete_output {
if (-d $file) {
print "\tdelete directory $file\n";
`rm -r $file` if $EXECUTE;
- return;
}
# delete regular file that matches exactly
if (-e $file) {
@@ -821,11 +829,20 @@ sub delete_output {
`rm $file` if $EXECUTE;
}
# delete files that have additional extension
+ $file =~ /^(.+)\/([^\/]+)$/;
+ my ($dir,$f) = ($1,$2);
my @FILES = `ls $file.* 2>/dev/null`;
- foreach (@FILES) {
+ foreach (`ls $dir`) {
chop;
- print "\tdelete file $_\n";
- `rm $_` if $EXECUTE;
+ next unless substr($_,0,length($f)) eq $f;
+ if (-e $_) {
+ print "\tdelete file $dir/$_\n";
+ `rm $dir/$_` if $EXECUTE;
+ }
+ else {
+ print "\tdelete directory $dir/$_\n";
+ `rm -r $dir/$_` if $EXECUTE;
+ }
}
}
@@ -1513,9 +1530,9 @@ sub check_if_crashed {
'error','killed','core dumped','can\'t read',
'no such file or directory','unknown option',
'died at','exit code','permission denied',
- 'segmentation fault','abort',
- 'no space left on device',
- 'can\'t locate', 'unrecognized option', 'Exception') {
+ 'segmentation fault','abort',
+ 'no space left on device', ': not found',
+ 'can\'t locate', 'unrecognized option', 'Exception') {
if (/$pattern/i) {
my $not_error = 0;
if (defined($NOT_ERROR{&defined_step_id($i)})) {
@@ -1535,7 +1552,6 @@ sub check_if_crashed {
# check if output file empty
my $output = &get_default_file(&deconstruct_name($DO_STEP[$i]));
- print STDERR "".$DO_STEP[$i]." -> $output\n";
# currently only works for single output file
if (-e $output && -z $output) {
push @DIGEST,"output file $output is empty";
@@ -1861,7 +1877,7 @@ sub define_tuning_tune {
$cmd .= " --lambdas \"$lambda\"" if $lambda;
$cmd .= " --continue" if $tune_continue;
$cmd .= " --skip-decoder" if $skip_decoder;
- $cmd .= " --inputtype $tune_inputtype" if $tune_inputtype;
+ $cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
my $qsub_args = &get_qsub_args("TUNING");
$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
@@ -2152,13 +2168,14 @@ sub define_training_build_transliteration_model {
my ($model, $corpus, $alignment) = &get_output_and_input($step_id);
- my $moses_script_dir = &check_and_get("GENERAL:moses-script-dir");
- my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
- my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
- my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
- my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
- my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
- my $srilm_dir = &check_and_get("GENERAL:srilm-dir");
+ my $moses_script_dir = &check_and_get("GENERAL:moses-script-dir");
+ my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
+ my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
+ my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
+ my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
+ my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
+ my $srilm_dir = &check_and_get("TRAINING:srilm-dir");
+ my $decoder = &get("TRAINING:transliteration-decoder");
my $cmd = "$moses_script_dir/Transliteration/train-transliteration-module.pl";
$cmd .= " --corpus-f $corpus.$input_extension";
@@ -2166,6 +2183,7 @@ sub define_training_build_transliteration_model {
$cmd .= " --alignment $alignment.$sym_method";
$cmd .= " --out-dir $model";
$cmd .= " --moses-src-dir $moses_src_dir";
+ $cmd .= " --decoder $decoder" if defined($decoder);
$cmd .= " --external-bin-dir $external_bin_dir";
$cmd .= " --srilm-dir $srilm_dir";
$cmd .= " --input-extension $input_extension";
@@ -2174,7 +2192,7 @@ sub define_training_build_transliteration_model {
$cmd .= " --source-syntax " if &get("GENERAL:input-parser");
$cmd .= " --target-syntax " if &get("GENERAL:output-parser");
- &create_step($step_id, $cmd);
+ &create_step($step_id, $cmd);
}
sub define_training_extract_phrases {
@@ -2194,9 +2212,10 @@ sub define_training_extract_phrases {
$cmd .= "-glue-grammar-file $glue_grammar_file ";
if (&get("GENERAL:output-parser") && (&get("TRAINING:use-unknown-word-labels") || &get("TRAINING:use-unknown-word-soft-matches"))) {
- my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""));
- $cmd .= "-unknown-word-label $unknown_word_label ";
+ my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""));
+ $cmd .= "-unknown-word-label $unknown_word_label ";
}
+
if (&get("GENERAL:output-parser") && &get("TRAINING:use-unknown-word-soft-matches")) {
my $unknown_word_soft_matches = &versionize(&long_file_name("unknown-word-soft-matches","model",""));
$cmd .= "-unknown-word-soft-matches $unknown_word_soft_matches ";
@@ -2209,6 +2228,16 @@ sub define_training_extract_phrases {
if (&get("TRAINING:ghkm-tree-fragments")) {
$cmd .= "-ghkm-tree-fragments ";
}
+
+ if (&get("TRAINING:ghkm-phrase-orientation")) {
+ $cmd .= "-ghkm-phrase-orientation ";
+ my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
+ $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
+ }
+
+ if (&get("TRAINING:ghkm-source-labels")) {
+ $cmd .= "-ghkm-source-labels ";
+ }
}
my $extract_settings = &get("TRAINING:extract-settings");
@@ -2241,6 +2270,16 @@ sub define_training_build_ttable {
if (&get("TRAINING:ghkm-tree-fragments")) {
$cmd .= "-ghkm-tree-fragments ";
}
+ if (&get("TRAINING:ghkm-phrase-orientation")) {
+ $cmd .= "-ghkm-phrase-orientation ";
+ my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
+ $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
+ }
+ if (&get("TRAINING:ghkm-source-labels")) {
+ $cmd .= "-ghkm-source-labels ";
+ my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
+ $cmd .= "-ghkm-source-labels-file $source_labels_file ";
+ }
}
&create_step($step_id,$cmd);
@@ -2349,6 +2388,15 @@ sub get_config_tables {
}
}
+ # memory mapped suffix array phrase table
+ my $mmsapt = &get("TRAINING:mmsapt");
+ if (defined($mmsapt)) {
+ $ptImpl = 11; # mmsapt
+ $mmsapt =~ s/num-features=(\d+) // || die("ERROR: mmsapt setting needs to set num-features");
+ $numFF = $1;
+ $cmd .= "-mmsapt '$mmsapt' ";
+ }
+
# additional settings for factored models
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $phrase_translation_table);
$cmd = trim($cmd);
@@ -2406,24 +2454,22 @@ sub define_training_create_config {
$cmd .= "-transliteration-phrase-table $transliteration_pt ";
}
- if($osm){
-
+ if ($osm) {
my $osm_settings = &get("TRAINING:operation-sequence-model-settings");
-
+ if ($osm_settings =~ /-factor *(\S+)/){
+ $cmd .= "-osm-model $osm/ -osm-setting $1 ";
+ }
+ else {
+ $cmd .= "-osm-model $osm/operationLM.bin ";
+ }
+ }
- if($osm_settings =~ /factor/){
-
- $cmd .= "-osm-model $osm/ ";
- my $find = "--factor";
- my $replace = "-osm-setting";
- $osm_settings =~ s/$find/$replace/g;
- $cmd .= "$osm_settings ";
- }
- else{
- $cmd .= "-osm-model $osm/operationLM.bin ";
- }
+ if (&get("TRAINING:ghkm-source-labels")) {
+ $cmd .= "-ghkm-source-labels ";
+ my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
+ $cmd .= "-ghkm-source-labels-file $source_labels_file ";
}
-
+
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
@@ -2504,10 +2550,19 @@ sub define_interpolated_lm_interpolate {
$interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
my $group = &get("INTERPOLATED-LM:group");
+ my $weights = &get("INTERPOLATED-LM:weights");
my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
my $cmd = "";
+ my %WEIGHT;
+ if (defined($weights)) {
+ foreach (split(/ *, */,$weights)) {
+ /^ *(\S+) *= *(\S+)/ || die("ERROR: wrong interpolation weight specification $_ ($weights)");
+ $WEIGHT{$1} = $2;
+ }
+ }
+
# go through language models by factor and order
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
foreach my $factor (keys %{$ILM_SETS}) {
@@ -2516,11 +2571,18 @@ sub define_interpolated_lm_interpolate {
# get list of language model files
my $lm_list = "";
+ my $weight_list = "";
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
my ($id,$set) = split(/ /,$id_set,2);
$lm_list .= $LM[$id].",";
+ if (defined($weights)) {
+ die("ERROR: no interpolation weight set for $factor:$order:$set (factor:order:set)")
+ unless defined($WEIGHT{"$factor:$order:$set"});
+ $weight_list .= $WEIGHT{"$factor:$order:$set"}.",";
+ }
}
chop($lm_list);
+ chop($weight_list);
# if grouping, identify position in list
my $numbered_string = "";
@@ -2561,6 +2623,7 @@ sub define_interpolated_lm_interpolate {
}
$cmd .= "$interpolation_script --tuning $factored_tuning --name $name --srilm $srilm_dir --lm $lm_list";
$cmd .= " --group \"$numbered_string\"" if defined($group);
+ $cmd .= " --weights \"$weight_list\"" if defined($weights);
$cmd .= "\n";
}
}
@@ -3234,6 +3297,7 @@ sub define_template {
# replace IN and OUT with %s
$single_cmd =~ s/EMS_IN_EMS\S*/\%s/;
$single_cmd =~ s/EMS_OUT_EMS\S*/\%s/;
+ $single_cmd =~ s/EMS_SLASH_OUT_EMS\S*/\%s/;
# build tmp
my $tmp_dir = $module;
$tmp_dir =~ tr/A-Z/a-z/;
@@ -3274,6 +3338,10 @@ sub define_template {
$cmd =~ s/EMS_IN_EMS/$INPUT[0]/g;
}
$cmd =~ s/EMS_OUT_EMS/$output/g;
+ if (defined($STEP_TMPNAME{"$module:$stepname"})) {
+ my $tmp = $dir."/".$STEP_TMPNAME{"$module:$stepname"}.".$VERSION";
+ $cmd =~ s/EMS_TMP_EMS/$tmp/g;
+ }
$cmd =~ s/VERSION/$VERSION/g;
print "\tcmd is $cmd\n" if $VERBOSE;
while ($cmd =~ /^([\S\s]*)\$\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
@@ -3381,7 +3449,7 @@ sub check_backoff_and_get_array {
# the following two functions deal with getting information about
# files that are passed between steps. this are either specified
# in the meta file (default) or in the configuration file (here called
-# 'specified', in the step management refered to as 'given').
+# 'specified', in the step management referred to as 'given').
sub get_specified_or_default_file {
my ($specified_module,$specified_set,$specified_parameter,
@@ -3426,10 +3494,11 @@ sub get_default_file {
my $name = &construct_name($module,$set,$out);
return &check_backoff_and_get($name);
}
-# print "\t\tpassing $step -> ";
+# print "\t\tpassing $step\n";
$i = $DEPENDENCY[$i][0];
$step = $DO_STEP[$i];
# print "\t\tbacking off to $step\n";
+ ($default_module,$default_set,$default_step) = &deconstruct_name($step);
}
# get file name
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index a2f9580a9..be5b76a5e 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -745,37 +745,15 @@ sub hierarchical_segmentation {
open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
open(NODE,">$dir/node") or die "Cannot open: $!";
while(<TRACE>) {
- /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
- /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_");
- my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
- if ($last_sentence >= 0 && $sentence != $last_sentence) {
- &hs_process($last_sentence,\@DERIVATION,\%STATS);
- @DERIVATION = ();
- }
- my %ITEM;
- $ITEM{'start'} = $start;
- $ITEM{'end'} = $end;
- $ITEM{'rule_lhs'} = $rule_lhs;
-
- $rule_rhs =~ s/</&lt;/g;
- $rule_rhs =~ s/>/&gt;/g;
- @{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs);
-
- foreach (split(/ /,$alignment)) {
- /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
- $ITEM{'alignment'}{$2} = $1; # target non-terminal to source span
- $ITEM{'alignedSpan'}{$1} = 1;
- }
-
- @{$ITEM{'spans'}} = ();
- foreach my $span (reverse split(/\s+/,$spans)) {
- $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
- my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
- push @{$ITEM{'spans'}}, \%SPAN;
- }
-
- push @DERIVATION,\%ITEM;
- $last_sentence = $sentence;
+ my $sentence;
+ my %ITEM;
+ &hs_scan_line($_, \$sentence, \%ITEM) || die("cannot scan line $_");
+ if ($last_sentence >= 0 && $sentence != $last_sentence) {
+ &hs_process($last_sentence,\@DERIVATION,\%STATS);
+ @DERIVATION = ();
+ }
+ push @DERIVATION,\%ITEM;
+ $last_sentence = $sentence;
}
&hs_process($last_sentence,\@DERIVATION,\%STATS);
close(TRACE);
@@ -793,6 +771,84 @@ sub hierarchical_segmentation {
close(SUMMARY);
}
+# scan a single line of the trace file
+sub hs_scan_line {
+ my ($line,$ref_sentence,$ref_item) = @_;
+
+ if ($line =~ /^Trans Opt/) {
+ # Old format
+ $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
+ $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ || return 0;
+ my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
+
+ ${$ref_sentence} = $sentence;
+
+ $ref_item->{'start'} = $start;
+ $ref_item->{'end'} = $end;
+ $ref_item->{'rule_lhs'} = $rule_lhs;
+
+ $rule_rhs =~ s/</&lt;/g;
+ $rule_rhs =~ s/>/&gt;/g;
+ @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs);
+
+ foreach (split(/ /,$alignment)) {
+ /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
+ $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
+ $ref_item->{'alignedSpan'}{$1} = 1;
+ }
+
+ @{$ref_item->{'spans'}} = ();
+ foreach my $span (reverse split(/\s+/,$spans)) {
+ $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
+ my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
+ push @{$ref_item->{'spans'}}, \%SPAN;
+ }
+ } else {
+ # New format
+ $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0;
+ my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6);
+
+ ${$ref_sentence} = $sentence;
+
+ @{$ref_item->{'spans'}} = ();
+ foreach (split(/ /,$source_rhs)) {
+ /^\[?([^\]]+)\]?$/;
+ my %SPAN = ( 'word' => $1 );
+ push @{$ref_item->{'spans'}}, \%SPAN;
+ }
+
+ my $i = 0;
+ foreach my $span (split(/ /,$source_spans)) {
+ $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n");
+ $ref_item->{'spans'}[$i]{'from'} = $1;
+ $ref_item->{'spans'}[$i]{'to'} = $2;
+ if ($i == 0) {
+ $ref_item->{'start'} = $1;
+ }
+ $ref_item->{'end'} = $2;
+ $i++;
+ }
+
+ $ref_item->{'rule_lhs'} = $target_lhs;
+
+ $target_rhs =~ s/</&lt;/g;
+ $target_rhs =~ s/>/&gt;/g;
+ @{$ref_item->{'rule_rhs'}} = ();
+ foreach (split(/ /,$target_rhs)) {
+ /^\[?([^\]]+)\]?$/;
+ push @{$ref_item->{'rule_rhs'}}, $1;
+ }
+
+ foreach (split(/ /,$alignment)) {
+ /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
+ $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
+ $ref_item->{'alignedSpan'}{$1} = 1;
+ }
+ }
+
+ return 1;
+}
+
# process a single sentence for hierarchical segmentation
sub hs_process {
my ($sentence,$DERIVATION,$STATS) = @_;
diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl
index 9e4b35a77..722f02701 100755
--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@@ -19,7 +19,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
$factor = $1 if $feature_spec =~ / factor ([\d\-]+)/;
if ($SPEC[0] eq 'target-word-insertion') {
- $ini .= "TargetWordInsertionFeature factor=$factor";
+ $ini .= "TargetWordInsertionFeature name=TWI factor=$factor";
if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
my $file = &create_top_words($output_extension, $SPEC[2]);
@@ -33,7 +33,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
$ini .= "\n";
}
elsif ($SPEC[0] eq 'source-word-deletion') {
- $ini .= "SourceWordDeletionFeature factor=$factor";
+ $ini .= "SourceWordDeletionFeature name=SWD factor=$factor";
if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
my $file = &create_top_words($input_extension, $SPEC[2]);
$ini .= " path=$file";
@@ -59,10 +59,10 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
die("ERROR: Unknown parameter specification in '$SPEC[1]'\n");
}
my ($input_factor,$output_factor) = split(/\-/,$factor);
- $ini .= "WordTranslationFeature input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
+ $ini .= "WordTranslationFeature name=WT input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
}
elsif ($SPEC[0] eq 'phrase-length') {
- $ini .= "PhraseLengthFeature\n";
+ $ini .= "PhraseLengthFeature name=PL\n";
}
else {
die("ERROR: Unknown feature type '$SPEC[0]' in specification '$feature_spec'\nfull spec: '$specification'\n");
diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py
index 5d5187c47..e88b63e3d 100644
--- a/scripts/ems/support/defaultconfig.py
+++ b/scripts/ems/support/defaultconfig.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
#
# Version of ConfigParser which accepts default values
diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl
index 6d6f3fdd1..8380f26ca 100755
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@@ -5,6 +5,7 @@ use IPC::Open3;
use File::Temp qw/tempdir/;
use File::Path qw/rmtree/;
use Getopt::Long "GetOptions";
+use Symbol;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
@@ -12,7 +13,7 @@ binmode(STDERR, ":utf8");
my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";
-my ($TUNING,$LM,$NAME,$GROUP,$CONTINUE);
+my ($TUNING,$LM,$NAME,$GROUP,$WEIGHTS,$CONTINUE);
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
unless &GetOptions('tuning=s' => => \$TUNING,
@@ -21,6 +22,7 @@ die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--sril
'tempdir=s' => \$TEMPDIR,
'continue' => \$CONTINUE,
'group=s' => \$GROUP,
+ 'weights=s' => \$WEIGHTS,
'lm=s' => \$LM);
# check and set default to unset parameters
@@ -32,6 +34,10 @@ die("ERROR: did not find srilm dir") unless -e $SRILM;
die("ERROR: cannot run ngram") unless -x $SRILM."/ngram";
my @LM = split(/,/,$LM);
+my @WEIGHT;
+@WEIGHT = split(/,/,$WEIGHTS) if defined($WEIGHTS);
+die("ERROR: different number of weights and language models: ".scalar(@WEIGHT)." vs. ".scalar(@LM))
+ if defined($WEIGHTS) && scalar(@WEIGHT) != scalar(@LM);
# establish order
my $order = 0;
@@ -75,7 +81,7 @@ if (!defined($GROUP) && scalar(@LM) > 10) {
# normal interpolation
if (!defined($GROUP)) {
- &interpolate($NAME,@LM);
+ &interpolate($NAME,\@WEIGHT,@LM);
exit;
}
@@ -98,50 +104,58 @@ foreach my $subgroup (split(/ /,$GROUP)) {
my $name = $NAME.".group-".chr(97+($g++));
push @SUB_NAME,$name;
print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
- &interpolate($name, @SUB_LM) unless $CONTINUE && -e $name;
+ &interpolate($name, undef, @SUB_LM) unless $CONTINUE && -e $name;
}
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
next if defined($ALREADY{$lm_i});
push @SUB_NAME, $LM[$lm_i];
}
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
-&interpolate($NAME, @SUB_NAME);
+&interpolate($NAME, undef, @SUB_NAME);
# main interpolation function
sub interpolate {
- my ($name,@LM) = @_;
+ my ($name,$WEIGHT,@LM) = @_;
die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
if scalar(@LM) > 10;
my $tmp = tempdir(DIR=>$TEMPDIR);
+ my @LAMBDA;
- # compute perplexity
- my $i = 0;
- foreach my $lm (@LM) {
- print STDERR "compute perplexity for $lm\n";
- safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
- print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
- $i++;
+ # if weights are specified, use them
+ if (defined($WEIGHT) && scalar(@$WEIGHT) == scalar(@LM)) {
+ @LAMBDA = @$WEIGHT;
}
+ # no specified weights -> compute them
+ else {
+ # compute perplexity
+ my $i = 0;
+ foreach my $lm (@LM) {
+ print STDERR "compute perplexity for $lm\n";
+ safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
+ print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
+ $i++;
+ }
- # compute lambdas
- print STDERR "computing lambdas...\n";
- my $cmd = "$SRILM/compute-best-mix";
- for(my $i=0;$i<scalar(@LM);$i++) {
- $cmd .= " $tmp/iplm.$$.$i";
+ # compute lambdas
+ print STDERR "computing lambdas...\n";
+ my $cmd = "$SRILM/compute-best-mix";
+ for(my $i=0;$i<scalar(@LM);$i++) {
+ $cmd .= " $tmp/iplm.$$.$i";
+ }
+ my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
+ die "Failed to mix models: $mixerr" if $mixexitcode != 0;
+ my $mix = $mixout;
+ `rm $tmp/iplm.$$.*`;
+ $mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
+ @LAMBDA = split(/ /,$1);
}
- my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
- die "Failed to mix models: $mixerr" if $mixexitcode != 0;
- my $mix = $mixout;
- `rm $tmp/iplm.$$.*`;
- $mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
- my @LAMBDA = split(/ /,$1);
-
+
# create new language model
print STDERR "creating new language model...\n";
- $i = 0;
- $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
+ my $i = 0;
+ my $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;
@@ -179,10 +193,11 @@ sub safesystem {
sub saferun3 {
print STDERR "Executing: @_\n";
- my($wtr, $rdr, $err);
+ my $wtr = gensym();
+ my $rdr = gensym();
+ my $err = gensym();
my $pid = open3($wtr, $rdr, $err, @_);
close($wtr);
- waitpid($pid, 0);
my $gotout = "";
$gotout .= $_ while (<$rdr>);
close $rdr;
@@ -191,6 +206,7 @@ sub saferun3 {
$goterr .= $_ while (<$err>);
close $err;
}
+ waitpid($pid, 0);
if ($? == -1) {
print STDERR "Failed to execute: @_\n $!\n";
exit(1);
diff --git a/scripts/ems/support/mml-filter.py b/scripts/ems/support/mml-filter.py
index 437c9dade..5fb43d71e 100755
--- a/scripts/ems/support/mml-filter.py
+++ b/scripts/ems/support/mml-filter.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
#
# Filter a parallel corpus
diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl
index 3efb243d7..be1509b8f 100755
--- a/scripts/ems/support/substitute-filtered-tables.perl
+++ b/scripts/ems/support/substitute-filtered-tables.perl
@@ -17,7 +17,7 @@ while(my $line = <FILTERED>) {
$feature_section = ($1 eq "feature");
}
next unless $feature_section;
- if ($line =~ /PhraseDictionary/) {
+ if ($line =~ /PhraseDictionary/ || $line =~ /RuleTable/) {
print STDERR "pt:$line \n";
push(@arr, $line);
}
@@ -36,7 +36,7 @@ while(my $line = <STDIN>) {
if ($line =~ /^\[(.+)\]/) {
$feature_section = ($1 eq "feature");
}
- if ($feature_section && $line =~ /PhraseDictionary/) {
+ if ($feature_section && ($line =~ /PhraseDictionary/ || $line =~ /RuleTable/)) {
print $arr[$ind]."\n";
++$ind;
}
diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl
new file mode 100755
index 000000000..e6f7839f1
--- /dev/null
+++ b/scripts/ems/support/thot-lm-wrapper.perl
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Getopt::Long "GetOptions";
+
+my ($TEXT,$ORDER,$BIN,$LM,$TMP);
+
+&GetOptions('text=s' => \$TEXT,
+ 'lm=s' => \$LM,
+ 'tmp=s' => \$TMP,
+ 'bin=s' => \$BIN,
+ 'order=i' => \$ORDER);
+
+die("ERROR: specify --text CORPUS --lm LM --order N --bin THOT_BINARY !")
+ unless defined($TEXT) && defined($LM) && defined($ORDER) && defined($BIN);
+
+my $cmd = "$BIN -c $TEXT -n $ORDER -o $LM -unk -sdir $TMP -tdir $TMP";
+
+print "exec: $cmd\n";
+`$cmd`;
diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl
index b663dcfe8..433e95b9d 100755
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@@ -29,6 +29,8 @@ my $otherExtractArgs= "";
my $weights = "";
my $baselineExtract;
my $glueFile;
+my $phraseOrientation = 0;
+my $phraseOrientationPriorsFile;
for (my $i = 8; $i < $#ARGV + 1; ++$i)
{
@@ -45,6 +47,11 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i)
$glueFile = $ARGV[++$i];
next;
}
+ $phraseOrientation = 1 if $ARGV[$i] eq "--PhraseOrientation";
+ if ($ARGV[$i] eq '--PhraseOrientationPriors') {
+ $phraseOrientationPriorsFile = $ARGV[++$i];
+ next;
+ }
$otherExtractArgs .= $ARGV[$i] ." ";
}
@@ -64,20 +71,20 @@ my $pid;
if ($numParallel > 1)
{
- $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $target $TMPDIR/target.";
+ $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $target $TMPDIR/target.";
$pid = RunFork($cmd);
push(@children, $pid);
- $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $source $TMPDIR/source.";
+ $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $source $TMPDIR/source.";
$pid = RunFork($cmd);
push(@children, $pid);
- $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $align $TMPDIR/align.";
+ $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $align $TMPDIR/align.";
$pid = RunFork($cmd);
push(@children, $pid);
if ($weights) {
- $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $weights $TMPDIR/weights.";
+ $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $weights $TMPDIR/weights.";
$pid = RunFork($cmd);
push(@children, $pid);
}
@@ -212,13 +219,39 @@ foreach (@children) {
waitpid($_, 0);
}
-# glue rules
+# merge glue rules
if (defined($glueFile)) {
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
print STDERR "Merging glue rules: $cmd \n";
print STDERR `$cmd`;
}
+# merge phrase orientation priors (GHKM extraction)
+if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
+ print STDERR "Merging phrase orientation priors\n";
+
+ my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors");
+ my %priorCounts;
+
+ foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) {
+ if (-f $filenamePhraseOrientationPriors) {
+ open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!";
+ while (my $line = <$infilePhraseOrientationPriors>) {
+ print $line;
+ my ($key, $value) = split / /, $line;
+ $priorCounts{$key} += $value;
+ }
+ close $infilePhraseOrientationPriors;
+ }
+ }
+
+ open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!";
+ foreach my $key (sort keys %priorCounts) {
+ print $outPhraseOrientationPriors $key." ".$priorCounts{$key}."\n";
+ }
+ close($outPhraseOrientationPriors);
+}
+
# delete temporary files
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;
@@ -259,15 +292,21 @@ sub NumStr($)
my $i = shift;
my $numStr;
if ($i < 10) {
- $numStr = "0000$i";
+ $numStr = "000000$i";
}
elsif ($i < 100) {
- $numStr = "000$i";
+ $numStr = "00000$i";
}
elsif ($i < 1000) {
- $numStr = "00$i";
+ $numStr = "0000$i";
}
elsif ($i < 10000) {
+ $numStr = "000$i";
+ }
+ elsif ($i < 100000) {
+ $numStr = "00$i";
+ }
+ elsif ($i < 1000000) {
$numStr = "0$i";
}
else {
diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl
new file mode 100755
index 000000000..d13c87310
--- /dev/null
+++ b/scripts/generic/fsa2fsal.pl
@@ -0,0 +1,49 @@
+#!/usr/bin/env perl
+# A very simple script that converts fsa format (openfst lattices) to the same
+# thing represented one sentence per line. It uses '|||' to delimit columns and
+# ' ' to delimit nodes (i.e. original lines).
+# Some rudimentary sanity checks are done on the fly.
+# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+
+use strict;
+
+my $errs = 0;
+sub err {
+ my $nr = shift;
+ my $msg = shift;
+ print STDERR "$nr:$msg\n";
+ $errs++;
+}
+
+my $onr = 0;
+my @lines = ();
+sub flush {
+ return if 0 == scalar @lines;
+ print join(" ", @lines);
+ print "\n";
+ $onr++;
+ @lines = ();
+}
+
+my $nr = 0;
+my $numscores = undef;
+while (<>) {
+ chomp;
+ if ($_ eq "") {
+ flush();
+ next;
+ }
+ my ($a, $b, $label, $scores, $rest) = split /\s+/, $_, 5;
+ err($nr, "The delimiter '|||' can't appear in the input!") if /\|\|\|/;
+ err($nr, "Node id not numeric: $a") if $a !~ /^\d+$/;
+ err($nr, "Node id not numeric: $b") if $b !~ /^\d+$/;
+ err($nr, "Unexpected tail: '$rest'") if defined $rest && $rest !~ /^\s*$/;
+ my $thisnumscores = ($scores =~ tr/,/,/);
+ $numscores = $thisnumscores if !defined $numscores;
+ err($nr, "Incompatible number of arc scores, previous lines had ".($numscores+1).", now ".($thisnumscores+1))
+ if $numscores != $thisnumscores;
+ push @lines, join("|||", ($a,$b,$label,$scores));
+}
+flush();
+
+exit 1 if $errs;
diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl
new file mode 100755
index 000000000..36aed0ecd
--- /dev/null
+++ b/scripts/generic/fsal2fsa.pl
@@ -0,0 +1,15 @@
+#!/usr/bin/env perl
+# A very simple script that converts fsal back to fsa format (openfst lattices)
+# Ondrej Bojar, bojar@ufal.mff.cuni.cz
+
+use strict;
+
+while (<>) {
+ chomp;
+ tr/ /\n/;
+ s/\|\|\|/\t/g;
+ print;
+ print "\n";
+ print "\n";
+}
+
diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl
index 594fbcf5d..2becba31c 100755
--- a/scripts/generic/generic-parallel.perl
+++ b/scripts/generic/generic-parallel.perl
@@ -90,19 +90,25 @@ sub NumStr($)
my $i = shift;
my $numStr;
if ($i < 10) {
- $numStr = "0000$i";
+ $numStr = "000000$i";
}
elsif ($i < 100) {
- $numStr = "000$i";
+ $numStr = "00000$i";
}
elsif ($i < 1000) {
- $numStr = "00$i";
+ $numStr = "0000$i";
}
elsif ($i < 10000) {
- $numStr = "0$i";
+ $numStr = "000$i";
+ }
+ elsif ($i < 100000) {
+ $numStr = "00$i";
+ }
+ elsif ($i < 1000000) {
+ $numStr = "0$i";
}
else {
- $numStr = $i;
+ $numStr = $i;
}
return $numStr;
}
diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl
index 60059b46a..55192af74 100755
--- a/scripts/generic/giza-parallel.perl
+++ b/scripts/generic/giza-parallel.perl
@@ -102,23 +102,29 @@ print $cmd;
sub NumStr($)
{
- my $i = shift;
- my $numStr;
- if ($i < 10) {
- $numStr = "0000$i";
- }
- elsif ($i < 100) {
- $numStr = "000$i";
- }
- elsif ($i < 1000) {
- $numStr = "00$i";
- }
- elsif ($i < 10000) {
- $numStr = "0$i";
- }
- else {
- $numStr = $i;
- }
- return $numStr;
+ my $i = shift;
+ my $numStr;
+ if ($i < 10) {
+ $numStr = "000000$i";
+ }
+ elsif ($i < 100) {
+ $numStr = "00000$i";
+ }
+ elsif ($i < 1000) {
+ $numStr = "0000$i";
+ }
+ elsif ($i < 10000) {
+ $numStr = "000$i";
+ }
+ elsif ($i < 100000) {
+ $numStr = "00$i";
+ }
+ elsif ($i < 1000000) {
+ $numStr = "0$i";
+ }
+ else {
+ $numStr = $i;
+ }
+ return $numStr;
}
diff --git a/scripts/generic/moses_sim_pe.py b/scripts/generic/moses_sim_pe.py
new file mode 100755
index 000000000..f4d640cfa
--- /dev/null
+++ b/scripts/generic/moses_sim_pe.py
@@ -0,0 +1,373 @@
+#!/usr/bin/env python
+
+# Written by Michael Denkowski
+
+# This script parallelizes decoding with simulated post-editing via moses XML
+# input (XML entities need to be escaped in tokenization). Memory mapped
+# dynamic phrase tables (Ulrich Germann,
+# www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models
+# (Kenneth Heafield,
+# http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19)
+# facilitate memory efficient multi process decoding. Input is divided into
+# batches, each of which is decoded sequentially. Each batch pre-loads the data
+# from previous batches.
+
+# To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the
+# alignment from input to references. Specify the number of jobs with
+# --decoder-flags="-threads N".
+
+import gzip
+import itertools
+import math
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import threading
+
+HELP = '''Moses with simulated post-editing
+
+Usage: {} moses-cmd -config moses.ini -input-file text.src -ref text.tgt -symal text.src-tgt.symal [options] [decoder flags]
+
+Options:
+ -threads N: number of decoders to run in parallel (default read from moses.ini, 1 if not present)
+ -n-best-list nbest.out N [distinct]: location and size of N-best list
+ -show-weights: for mert-moses.pl, just call moses and exit
+ -tmp: location of temp directory (default /tmp)
+
+Other options (decoder flags) are passed through to moses-cmd\n'''
+
+# Provides progress bar
+class Progress:
+
+ def __init__(self):
+ self.i = 0
+ self.lock = threading.Lock()
+
+ def inc(self):
+ self.lock.acquire()
+ self.i += 1
+ if self.i % 100 == 0:
+ sys.stderr.write('.')
+ if self.i % 1000 == 0:
+ sys.stderr.write(' [{}]\n'.format(self.i))
+ sys.stderr.flush()
+ self.lock.release()
+
+ def done(self):
+ self.lock.acquire()
+ if self.i % 1000 != 0:
+ sys.stderr.write('\n')
+ self.lock.release()
+
+# Run with atomic (synchronous) I/O
+def atomic_io(cmd, in_file, out_file, err_file, prog=None):
+ with open(in_file, 'r') as inp, open(out_file, 'w') as out, open(err_file, 'w') as err:
+ p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=err)
+ while True:
+ line = inp.readline()
+ if not line:
+ break
+ p.stdin.write(line)
+ out.write(p.stdout.readline())
+ out.flush()
+ if prog:
+ prog.inc()
+ p.stdin.close()
+ p.wait()
+
+# Open plain or gzipped text
+def gzopen(f):
+ return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r')
+
+# Word count
+def wc(f):
+ i = 0
+ for line in gzopen(f):
+ i += 1
+ return i
+
+# Write lines to gzipped file
+def write_gzfile(lines, f):
+ out = gzip.open(f, 'wb')
+ for line in lines:
+ out.write('{}\n'.format(line))
+ out.close()
+
+def main(argv):
+
+ # Defaults
+ moses_ini = None
+ moses_ini_lines = None
+ text_src = None
+ text_tgt = None
+ text_symal = None
+ text_len = None
+ threads_found = False
+ threads = 1
+ n_best_out = None
+ n_best_size = None
+ n_best_distinct = False
+ hg_ext = None
+ hg_dir = None
+ tmp_dir = '/tmp'
+ xml_found = False
+ xml_input = 'exclusive'
+ show_weights = False
+ mmsapt_dynamic = []
+ mmsapt_static = []
+ mmsapt_l1 = None
+ mmsapt_l2 = None
+
+ # Decoder command
+ cmd = argv[1:]
+
+ # Parse special options and remove from cmd
+ i = 1
+ while i < len(cmd):
+ if cmd[i] in ('-f', '-config'):
+ moses_ini = cmd[i + 1]
+ cmd = cmd[:i] + cmd[i + 2:]
+ elif cmd[i] in ('-i', '-input-file'):
+ text_src = cmd[i + 1]
+ cmd = cmd[:i] + cmd[i + 2:]
+ elif cmd[i] == '-ref':
+ text_tgt = cmd[i + 1]
+ cmd = cmd[:i] + cmd[i + 2:]
+ elif cmd[i] == '-symal':
+ text_symal = cmd[i + 1]
+ cmd = cmd[:i] + cmd[i + 2:]
+ elif cmd[i] in ('-th', '-threads'):
+ threads_found = True
+ threads = int(cmd[i + 1])
+ cmd = cmd[:i] + cmd[i + 2:]
+ elif cmd[i] == '-n-best-list':
+ n_best_out = cmd[i + 1]
+ n_best_size = cmd[i + 2]
+ # Optional "distinct"
+ if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
+ n_best_distinct = True
+ cmd = cmd[:i] + cmd[i + 4:]
+ else:
+ cmd = cmd[:i] + cmd[i + 3:]
+ elif cmd[i] == '-output-search-graph-hypergraph':
+ # cmd[i + 1] == true
+ hg_ext = cmd[i + 2]
+ if i + 3 < len(cmd) and cmd[i + 3][0] != '-':
+ hg_dir = cmd[i + 3]
+ cmd = cmd[:i] + cmd[i + 4:]
+ else:
+ hg_dir = 'hypergraph'
+ cmd = cmd[:i] + cmd[i + 3:]
+ elif cmd[i] == '-tmp':
+ tmp_dir = cmd[i + 1]
+ cmd = cmd[:i] + cmd[i + 2:]
+ # Handled specially to make sure XML input is turned on somewhere
+ elif cmd[i] in ('-xi', '-xml-input'):
+ xml_found = True
+ xml_input = cmd[i + 1]
+ cmd = cmd[:i] + cmd[i + 2:]
+ # Handled specially for mert-moses.pl
+ elif cmd[i] == '-show-weights':
+ show_weights = True
+ # Do not remove from cmd
+ i += 1
+ else:
+ i += 1
+
+ # Read moses.ini
+ if moses_ini:
+ moses_ini_lines = [line.strip() for line in open(moses_ini, 'r')]
+ i = 0
+ while i < len(moses_ini_lines):
+ # PhraseDictionaryBitextSampling name=TranslationModel0 output-factor=0 num-features=7 path=corpus. L1=src L2=tgt pfwd=g pbwd=g smooth=0 sample=1000 workers=1
+ if moses_ini_lines[i].startswith('PhraseDictionaryBitextSampling'):
+ for (k, v) in (pair.split('=') for pair in moses_ini_lines[i].split()[1:]):
+ if k == 'name':
+ # Dynamic means update this model
+ if v.startswith('Dynamic'):
+ mmsapt_dynamic.append(v)
+ moses_ini_lines[i] += '{mmsapt_extra}'
+ else:
+ mmsapt_static.append(v)
+ elif k == 'L1':
+ if mmsapt_l1 and v != mmsapt_l1:
+ sys.stderr.write('Error: All PhraseDictionaryBitextSampling entries should have same L1: {} != {}\n'.format(v, mmsapt_l1))
+ sys.exit(1)
+ mmsapt_l1 = v
+ elif k == 'L2':
+ if mmsapt_l2 and v != mmsapt_l2:
+ sys.stderr.write('Error: All PhraseDictionaryBitextSampling entries should have same L2: {} != {}\n'.format(v, mmsapt_l2))
+ sys.exit(1)
+ mmsapt_l2 = v
+ # [threads]
+ # 8
+ elif moses_ini_lines[i] == '[threads]':
+ # Prefer command line over moses.ini
+ if not threads_found:
+ threads = int(moses_ini_lines[i + 1])
+ i += 1
+ # [xml-input]
+ # exclusive
+ elif moses_ini_lines[i] == '[xml-input]':
+ # Prefer command line over moses.ini
+ if not xml_found:
+ xml_found = True
+ xml_input = moses_ini_lines[i + 1]
+ i += 1
+ i += 1
+
+ # If mert-moses.pl passes -show-weights, just call moses
+ if show_weights:
+ # re-append original moses.ini
+ cmd.append('-config')
+ cmd.append(moses_ini)
+ sys.stdout.write(subprocess.check_output(cmd))
+ sys.stdout.flush()
+ sys.exit(0)
+
+ # Input length
+ if text_src:
+ text_len = wc(text_src)
+
+ # Check inputs
+ if not (len(cmd) > 0 and all((moses_ini, text_src, text_tgt, text_symal))):
+ sys.stderr.write(HELP.format(argv[0]))
+ sys.exit(2)
+ if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
+ sys.stderr.write('Error: moses-cmd "{}" is not executable\n'.format(cmd[0]))
+ sys.exit(1)
+ if not mmsapt_dynamic:
+ sys.stderr.write('Error: no PhraseDictionaryBitextSampling entries named "Dynamic..." found in {}. See http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40\n'.format(moses_ini))
+ sys.exit(1)
+ if wc(text_tgt) != text_len or wc(text_symal) != text_len:
+ sys.stderr.write('Error: length mismatch between "{}", "{}", and "{}"\n'.format(text_src, text_tgt, text_symal))
+ sys.exit(1)
+
+ # Setup
+ work_dir = tempfile.mkdtemp(prefix='moses.', dir=os.path.abspath(tmp_dir))
+ threads = min(threads, text_len)
+ batch_size = int(math.ceil(float(text_len) / threads))
+
+ # Report settings
+ sys.stderr.write('Moses flags: {}\n'.format(' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
+ for (i, n) in enumerate(mmsapt_dynamic):
+ sys.stderr.write('Dynamic mmsapt {}: {} {} {}\n'.format(i, n, mmsapt_l1, mmsapt_l2))
+ for (i, n) in enumerate(mmsapt_static):
+ sys.stderr.write('Static mmsapt {}: {} {} {}\n'.format(i, n, mmsapt_l1, mmsapt_l2))
+ sys.stderr.write('XML mode: {}\n'.format(xml_input))
+ sys.stderr.write('Inputs: {} {} {} ({})\n'.format(text_src, text_tgt, text_symal, text_len))
+ sys.stderr.write('Jobs: {}\n'.format(threads))
+ sys.stderr.write('Batch size: {}\n'.format(batch_size))
+ if n_best_out:
+ sys.stderr.write('N-best list: {} ({}{})\n'.format(n_best_out, n_best_size, ', distinct' if n_best_distinct else ''))
+ if hg_dir:
+ sys.stderr.write('Hypergraph dir: {} ({})\n'.format(hg_dir, hg_ext))
+ sys.stderr.write('Temp dir: {}\n'.format(work_dir))
+
+ # Accumulate seen lines
+ src_lines = []
+ tgt_lines = []
+ symal_lines = []
+
+ # Current XML source file
+ xml_out = None
+
+ # Split into batches. Each batch after 0 gets extra files with data from previous batches.
+ # Data from previous lines in the current batch is added using XML input.
+ job = -1
+ lc = -1
+ for (src, tgt, symal) in itertools.izip(gzopen(text_src), gzopen(text_tgt), gzopen(text_symal)):
+ (src, tgt, symal) = (src.strip(), tgt.strip(), symal.strip())
+ lc += 1
+ if lc % batch_size == 0:
+ job += 1
+ xml_file = os.path.join(work_dir, 'input.{}.xml'.format(job))
+ extra_src_file = os.path.join(work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l1))
+ extra_tgt_file = os.path.join(work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l2))
+ extra_symal_file = os.path.join(work_dir, 'extra.{}.{}-{}.symal.gz'.format(job, mmsapt_l1, mmsapt_l2))
+ if job > 0:
+ xml_out.close()
+ write_gzfile(src_lines, extra_src_file)
+ write_gzfile(tgt_lines, extra_tgt_file)
+ write_gzfile(symal_lines, extra_symal_file)
+ xml_out = open(xml_file, 'w')
+ with open(os.path.join(work_dir, 'moses.{}.ini'.format(job)), 'w') as moses_ini_out:
+ extra = '' if job == 0 else ' extra={}'.format(os.path.join(work_dir, 'extra.{}.'.format(job)))
+ moses_ini_out.write('{}\n'.format('\n'.join(moses_ini_lines).format(mmsapt_extra=extra)))
+ src_lines.append(src)
+ tgt_lines.append(tgt)
+ symal_lines.append(symal)
+ # Lines after first start with update tag including previous translation.
+ # Translation of last line of each batch is included in extra for next batch.
+ xml_tags = []
+ if lc % batch_size != 0:
+ for n in mmsapt_dynamic:
+ # note: space after tag
+ xml_tags.append('<update name="{}" source="{}" target="{}" alignment="{}" /> '.format(n, src_lines[-2], tgt_lines[-2], symal_lines[-2]))
+ xml_out.write('{}{}\n'.format(''.join(xml_tags), src))
+ xml_out.close()
+
+ # Run decoders in parallel
+ workers = []
+ prog = Progress()
+ for i in range(threads):
+ work_cmd = cmd[:]
+ work_cmd.append('-config')
+ work_cmd.append(os.path.join(work_dir, 'moses.{}.ini'.format(i)))
+ # Workers use 1 CPU each
+ work_cmd.append('-threads')
+ work_cmd.append('1')
+ if not xml_found:
+ work_cmd.append('-xml-input')
+ work_cmd.append(xml_input)
+ if n_best_out:
+ work_cmd.append('-n-best-list')
+ work_cmd.append(os.path.join(work_dir, 'nbest.{}'.format(i)))
+ work_cmd.append(str(n_best_size))
+ if n_best_distinct:
+ work_cmd.append('distinct')
+ if hg_dir:
+ work_cmd.append('-output-search-graph-hypergraph')
+ work_cmd.append('true')
+ work_cmd.append(hg_ext)
+ work_cmd.append(os.path.join(work_dir, 'hg.{}'.format(i)))
+ in_file = os.path.join(work_dir, 'input.{}.xml'.format(i))
+ out_file = os.path.join(work_dir, 'out.{}'.format(i))
+ err_file = os.path.join(work_dir, 'err.{}'.format(i))
+ t = threading.Thread(target=atomic_io, args=(work_cmd, in_file, out_file, err_file, prog))
+ workers.append(t)
+ t.start()
+ # Wait for all to finish
+ for t in workers:
+ t.join()
+ prog.done()
+
+ # Gather N-best lists
+ if n_best_out:
+ with open(n_best_out, 'w') as out:
+ for i in range(threads):
+ for line in open(os.path.join(work_dir, 'nbest.{}'.format(i)), 'r'):
+ entry = line.partition(' ')
+ out.write('{} {}'.format(int(entry[0]) + (i * batch_size), entry[2]))
+
+ # Gather hypergraphs
+ if hg_dir:
+ if not os.path.exists(hg_dir):
+ os.mkdir(hg_dir)
+ shutil.copy(os.path.join(work_dir, 'hg.0', 'weights'), os.path.join(hg_dir, 'weights'))
+ for i in range(threads):
+ for j in range(batch_size):
+ shutil.copy(os.path.join(work_dir, 'hg.{}'.format(i), '{}.{}'.format(j, hg_ext)), os.path.join(hg_dir, '{}.{}'.format((i * batch_size) + j, hg_ext)))
+
+ # Gather stdout
+ for i in range(threads):
+ for line in open(os.path.join(work_dir, 'out.{}'.format(i)), 'r'):
+ sys.stdout.write(line)
+
+ # Cleanup
+ shutil.rmtree(work_dir)
+
+if __name__ == '__main__':
+ main(sys.argv)
diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl
index da37b1353..7835d3826 100755
--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@@ -27,10 +27,22 @@ my $scoreCmd = $ARGV[2];
my $extractFile = $ARGV[3]; # 1st arg of extract argument
my $lexFile = $ARGV[4];
my $ptHalf = $ARGV[5]; # output
+my $inverse = 0;
+my $sourceLabelsFile;
my $otherExtractArgs= "";
for (my $i = 6; $i < $#ARGV; ++$i)
{
+ if ($ARGV[$i] eq '--SourceLabels') {
+ $sourceLabelsFile = $ARGV[++$i];
+ $otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
+ next;
+ }
+ if ($ARGV[$i] eq '--Inverse') {
+ $inverse = 1;
+ $otherExtractArgs .= $ARGV[$i] ." ";
+ next;
+ }
$otherExtractArgs .= $ARGV[$i] ." ";
}
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs
@@ -258,6 +270,14 @@ if (-e $cocPath)
close(FHCOC);
}
+# merge source label files
+if (!$inverse && defined($sourceLabelsFile))
+{
+ my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $sourceLabelsFile";
+ print STDERR "Merging source label files: $cmd \n";
+ `$cmd`;
+}
+
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;
systemCheck($cmd);
@@ -305,15 +325,21 @@ sub NumStr($)
my $i = shift;
my $numStr;
if ($i < 10) {
- $numStr = "0000$i";
+ $numStr = "000000$i";
}
elsif ($i < 100) {
- $numStr = "000$i";
+ $numStr = "00000$i";
}
elsif ($i < 1000) {
- $numStr = "00$i";
+ $numStr = "0000$i";
}
elsif ($i < 10000) {
+ $numStr = "000$i";
+ }
+ elsif ($i < 100000) {
+ $numStr = "00$i";
+ }
+ elsif ($i < 1000000) {
$numStr = "0$i";
}
else {
diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl
index 8b7c3a7e1..90d4329c1 100755
--- a/scripts/other/beautify.perl
+++ b/scripts/other/beautify.perl
@@ -40,6 +40,8 @@ sub Beautify($)
next if ($name eq "srilm");
next if ($name eq "irstlm");
next if ($name eq "UG");
+ next if ($name eq "pcfg-common");
+ next if ($name eq "syntax-common");
$name = $path ."/" .$name;
if (-d $name) {
diff --git a/scripts/other/blame-stat.sh b/scripts/other/blame-stat.sh
new file mode 100755
index 000000000..7ceddfc5d
--- /dev/null
+++ b/scripts/other/blame-stat.sh
@@ -0,0 +1,4 @@
+git ls-files | xargs -n1 git blame --line-porcelain | sed -n 's/^author //p' | sort -f | uniq -ic | sort -nr
+
+#git ls-files | grep -Ei "\.h$|\.cpp$|\.hh$|\.cc$" | xargs -n1 git blame --line-porcelain | sed -n 's/^author //p' | sort -f | uniq -ic | sort -nr
+
diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl
index 442173026..2a4f51c89 100755
--- a/scripts/other/delete-scores.perl
+++ b/scripts/other/delete-scores.perl
@@ -59,3 +59,5 @@ sub DeleteScore
return $string;
}
+
+
diff --git a/scripts/other/retain-lines.perl b/scripts/other/retain-lines.perl
new file mode 100755
index 000000000..6f7c517c2
--- /dev/null
+++ b/scripts/other/retain-lines.perl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl
+
+# Retain only those STDIN lines whose line numbers are listed in the retain-file argument (e.g. clean.lines-retained.1)
+use strict;
+use warnings;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+my $retainPath = $ARGV[0];
+
+open(LINE_RETAINED, $retainPath);
+my $retainLine = <LINE_RETAINED>;
+
+my $lineNum = 0;
+while (my $line = <STDIN>) {
+ chomp($line);
+ ++$lineNum;
+
+ if ($retainLine == $lineNum) {
+ print "$line\n";
+ if ($retainLine = <LINE_RETAINED>) {
+ # do nothing
+ }
+ else {
+ # retained lines is finished.
+ $retainLine = 0;
+ }
+ }
+}
diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl
index fa833dbd6..b12aa6147 100755
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -118,7 +118,7 @@ sub train_lm {
}
else {
$LM = "KENLM";
- $cmd = "$BUILD_KENLM --prune 0 0 1 -S 50% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.gz";
+ $cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.gz";
}
print STDERR "** Using $LM **" . "\n";
print STDERR $cmd."\n";
diff --git a/scripts/server/moses.py b/scripts/server/moses.py
index 32c53fa2a..a176c473a 100644
--- a/scripts/server/moses.py
+++ b/scripts/server/moses.py
@@ -31,7 +31,7 @@ class ProcessWrapper:
def start(self, stdin=PIPE, stdout=PIPE):
if self.process:
raise Exception("Process is already running")
- self.process = Popen(cmd, stdin = stdin, stdout = stdout)
+ self.process = Popen(self.cmd, stdin = stdin, stdout = stdout)
return
def __del__(self):
@@ -57,6 +57,7 @@ class SentenceSplitter(ProcessWrapper):
def __init__(self,lang):
ssplit_cmd = moses_root+"/scripts/ems/support/split-sentences.perl"
self.cmd = [ssplit_cmd, "-b", "-q", "-l",lang]
+ self.process = None
return
def __call__(self,input):
@@ -91,15 +92,17 @@ class Tokenizer(LineProcessor):
def __init__(self,lang,args=["-a","-no-escape"]):
tok_cmd = moses_root+"/scripts/tokenizer/tokenizer.perl"
self.cmd = [tok_cmd,"-b", "-q", "-l", lang] + args
+ self.process = None
return
-class TrueCaser(LineProcessor):
+class Truecaser(LineProcessor):
"""
Truecaser wrapper.
"""
def __init__(self,model):
- trucase_cmd = moses_root+"/scripts/recaser/truecase.perl"
+ truecase_cmd = moses_root+"/scripts/recaser/truecase.perl"
self.cmd = [truecase_cmd,"-b", "--model",model]
+ self.process = None
return
pass
@@ -149,7 +152,7 @@ def find_free_port(p):
class MosesServer(ProcessWrapper):
- def __init__(self,args=["-fd", "\n"]):
+ def __init__(self,args=[]):
self.process = None
mserver_cmd = moses_root+"/bin/mosesserver"
self.cmd = [mserver_cmd] + args
@@ -172,7 +175,10 @@ class MosesServer(ProcessWrapper):
self.cmd.extend(["--server-port", "%d"%self.port])
if debug:
print >>sys.stderr,self.cmd
- self.process = Popen(self.cmd,stderr = sys.stderr)
+ # self.stderr = open("mserver.%d.stderr"%self.port,'w')
+ # self.stdout = open("mserver.%d.stdout"%self.port,'w')
+ # self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout)
+ self.process = Popen(self.cmd)
else:
devnull = open(os.devnull,"w")
self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
@@ -213,10 +219,13 @@ class MosesServer(ProcessWrapper):
elif type(input) is list:
return [self.translate(x) for x in input]
+
elif type(input) is dict:
return self.proxy.translate(input)
+
else:
raise Exception("Can't handle input of this type!")
+
except:
attempts += 1
print >>sys.stderr, "WAITING", attempts
diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py
index 340695a56..52d1e314a 100755
--- a/scripts/server/sim-pe.py
+++ b/scripts/server/sim-pe.py
@@ -127,13 +127,40 @@ def translate(proxy, args, line):
param['nbest-distinct'] = True
pass
attempts = 0
- while attempts < 120:
+ while attempts < 20:
+ t1 = time.time()
try:
- return proxy.translate(param)
- except:
- print >>sys.stderr, "Waiting", proxy
- attempts += 1
+ return proxy.translate(param)
+
+ # except xmlrpclib.Fault as e:
+ # except xmlrpclib.ProtocolError as e:
+ # except xmlrpclib.ResponseError as e:
+ except xmlrpclib.Error as e:
+ time.sleep(2) # give all the stderr stuff a chance to be flushed
+ print >>sys.stderr," XMLRPC error:",e
+ print >>sys.stderr, "Input was"
+ print >>sys.stderr, param
+ sys.exit(1)
+
+ except IOError as e:
+ print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror)
time.sleep(5)
+
+ except:
+ serverstatus = mserver.process.poll()
+ if serverstatus == None:
+ print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1)
+ attempts += 1
+ if attempts > 10:
+ time.sleep(10)
+ else:
+ time.sleep(5)
+ pass
+ else:
+
+ print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\
+ %(serverstatus/256,serverstatus%256)
+ pass
pass
pass
raise Exception("Exception: could not reach translation server.")
@@ -210,17 +237,25 @@ if __name__ == "__main__":
pass
pass
- if args.url:
- mserver.connect(args.url)
- else:
- mserver.start(args=mo_args,port=args.port,debug=args.debug)
- pass
-
ref = None
aln = None
if args.ref: ref = read_data(args.ref)
if args.aln: aln = read_data(args.aln)
+ if ref and aln:
+ try:
+ mo_args.index("--serial")
+ except:
+ mo_args.append("--serial")
+ pass
+ pass
+
+ if args.url:
+ mserver.connect(args.url)
+ else:
+ mserver.start(args=mo_args, port=args.port, debug=args.debug)
+ pass
+
if (args.input == "-"):
line = sys.stdin.readline()
idx = 0
diff --git a/scripts/share/nonbreaking_prefixes/README.txt b/scripts/share/nonbreaking_prefixes/README.txt
index 02cdfccb9..2276a1138 100644
--- a/scripts/share/nonbreaking_prefixes/README.txt
+++ b/scripts/share/nonbreaking_prefixes/README.txt
@@ -2,4 +2,7 @@ The language suffix can be found here:
http://www.loc.gov/standards/iso639-2/php/code_list.php
+This code includes data from Daniel Naber's Language Tools (czech abbreviations).
+This code includes data from czech wiktionary (also czech abbreviations).
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en
index e1a3733b5..43770db14 100644
--- a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en
@@ -105,3 +105,17 @@ Nos
Art #NUMERIC_ONLY#
Nr
pp #NUMERIC_ONLY#
+
+#month abbreviations
+Jan
+Feb
+Mar
+Apr
+May
+Jun
+Jul
+Aug
+Sep
+Oct
+Nov
+Dec
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fi b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fi
new file mode 100644
index 000000000..466c6a837
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.fi
@@ -0,0 +1,138 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT
+#indicate an end-of-sentence marker. Special cases are included for prefixes
+#that ONLY appear before 0-9 numbers.
+
+#This list is compiled from omorfi <http://code.google.com/p/omorfi> database
+#by Tommi A Pirinen.
+
+
+#any single upper case letter followed by a period is not a sentence ender
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Å
+Ä
+Ö
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+alik
+alil
+amir
+apul
+apul.prof
+arkkit
+ass
+assist
+dipl
+dipl.arkkit
+dipl.ekon
+dipl.ins
+dipl.kielenk
+dipl.kirjeenv
+dipl.kosm
+dipl.urk
+dos
+erikoiseläinl
+erikoishammasl
+erikoisl
+erikoist
+ev.luutn
+evp
+fil
+ft
+hallinton
+hallintot
+hammaslääket
+jatk
+jääk
+kansaned
+kapt
+kapt.luutn
+kenr
+kenr.luutn
+kenr.maj
+kers
+kirjeenv
+kom
+kom.kapt
+komm
+konst
+korpr
+luutn
+maist
+maj
+Mr
+Mrs
+Ms
+M.Sc
+neuv
+nimim
+Ph.D
+prof
+puh.joht
+pääll
+res
+san
+siht
+suom
+sähköp
+säv
+toht
+toim
+toim.apul
+toim.joht
+toim.siht
+tuom
+ups
+vänr
+vääp
+ye.ups
+ylik
+ylil
+ylim
+ylimatr
+yliop
+yliopp
+ylip
+yliv
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
+#into this category - it sometimes ends a sentence)
+e.g
+ent
+esim
+huom
+i.e
+ilm
+l
+mm
+myöh
+nk
+nyk
+par
+po
+t
+v
diff --git a/scripts/tokenizer/basic-protected-patterns b/scripts/tokenizer/basic-protected-patterns
new file mode 100644
index 000000000..57a0dd485
--- /dev/null
+++ b/scripts/tokenizer/basic-protected-patterns
@@ -0,0 +1,5 @@
+<\/?\S+\/?>
+<\S+( [a-zA-Z0-9]+\=\"?[^\"]\")+ ?\/?>
+<\S+( [a-zA-Z0-9]+\=\'?[^\']\')+ ?\/?>
+[\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}
+(http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+
diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl
new file mode 100755
index 000000000..ca4e8a1b3
--- /dev/null
+++ b/scripts/tokenizer/deescape-special-chars-PTB.perl
@@ -0,0 +1,19 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+while(<STDIN>) {
+ s/\&bar;/\|/g; # factor separator (legacy)
+ s/\&#124;/\|/g; # factor separator
+ s/\&lt;/\</g; # xml
+ s/\&gt;/\>/g; # xml
+ s/\&bra;/\[/g; # syntax non-terminal (legacy)
+ s/\&ket;/\]/g; # syntax non-terminal (legacy)
+ s/\&quot;/\"/g; # xml
+ s/\&apos;/\'/g; # xml
+ s/\&#91;/\[/g; # syntax non-terminal
+ s/\&#93;/\]/g; # syntax non-terminal
+ s/\&amp;/\&/g; # escape escape
+ s/\"([^\"]*)\"/\`\`$1\'\'/g;
+ print $_;
+}
diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl
index 76f58714f..58f568b57 100755
--- a/scripts/tokenizer/normalize-punctuation.perl
+++ b/scripts/tokenizer/normalize-punctuation.perl
@@ -2,7 +2,13 @@
use strict;
-my ($language) = @ARGV;
+my $language = "en";
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
+ /^-l$/ && ($language = shift, next);
+ /^[^\-]/ && ($language = $_, next);
+}
while(<STDIN>) {
s/\r//g;
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 9e399519a..224f2319c 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -232,15 +232,20 @@ sub tokenize
# Find protected patterns
my @protected = ();
foreach my $protected_pattern (@protected_patterns) {
- foreach ($text =~ /($protected_pattern)/) {
- push @protected, $_;
+ my $t = $text;
+ while ($t =~ /($protected_pattern)(.*)$/) {
+ push @protected, $1;
+ $t = $2;
}
}
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);
- $text =~ s,\Q$protected[$i],$subst,g;
+ $text =~ s,\Q$protected[$i], $subst ,g;
}
+ $text =~ s/ +/ /g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
# seperate out all "other" special characters
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl
new file mode 100755
index 000000000..006fb9c2d
--- /dev/null
+++ b/scripts/tokenizer/tokenizer_PTB.perl
@@ -0,0 +1,399 @@
+#!/usr/bin/perl -w
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+use Thread;
+
+my $mydir = "$RealBin/../share/nonbreaking_prefixes";
+
+my %NONBREAKING_PREFIX = ();
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+
+while (@ARGV)
+{
+ $_ = shift;
+ /^-b$/ && ($| = 1, next);
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+ /^-x$/ && ($SKIP_XML = 1, next);
+ /^-a$/ && ($AGGRESSIVE = 1, next);
+ /^-time$/ && ($TIMING = 1, next);
+ /^-threads$/ && ($NUM_THREADS = int(shift), next);
+ /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+}
+
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+ $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+
+# print help message
+if ($HELP)
+{
+ print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+ print "Options:\n";
+ print " -q ... quiet.\n";
+ print " -a ... aggressive hyphen splitting.\n";
+ print " -b ... disable Perl buffering.\n";
+ print " -time ... enable processing time calculation.\n";
+ exit;
+}
+
+if (!$QUIET)
+{
+ print STDERR "Tokenizer Version 1.1\n";
+ print STDERR "Language: $language\n";
+ print STDERR "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+if (scalar(%NONBREAKING_PREFIX) eq 0)
+{
+ print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+my @batch_sentences = ();
+my @thread_list = ();
+my $count_sentences = 0;
+
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+ while(<STDIN>)
+ {
+ $count_sentences = $count_sentences + 1;
+ push(@batch_sentences, $_);
+ if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+ {
+ # assign each thread work
+ for (my $i=0; $i<$NUM_THREADS; $i++)
+ {
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+ push(@thread_list, $new_thread);
+ }
+ foreach (@thread_list)
+ {
+ my $tokenized_list = $_->join;
+ foreach (@$tokenized_list)
+ {
+ print $_;
+ }
+ }
+ # reset for the new run
+ @thread_list = ();
+ @batch_sentences = ();
+ }
+ }
+ # the last batch
+ if (scalar(@batch_sentences)>0)
+ {
+ # assign each thread work
+ for (my $i=0; $i<$NUM_THREADS; $i++)
+ {
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+ if ($start_index >= scalar(@batch_sentences))
+ {
+ last;
+ }
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+ if ($end_index >= scalar(@batch_sentences))
+ {
+ $end_index = scalar(@batch_sentences)-1;
+ }
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+ push(@thread_list, $new_thread);
+ }
+ foreach (@thread_list)
+ {
+ my $tokenized_list = $_->join;
+ foreach (@$tokenized_list)
+ {
+ print $_;
+ }
+ }
+ }
+}
+else
+{# single thread only
+ while(<STDIN>)
+ {
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+ {
+ #don't try to tokenize XML/HTML tag lines
+ print $_;
+ }
+ else
+ {
+ print &tokenize($_);
+ }
+ }
+}
+
+if ($TIMING)
+{
+ my $duration = Time::HiRes::tv_interval( $start_time );
+ print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+ print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# input: an array containing a batch of texts
+# return: another array containing a batch of tokenized texts for the input array
+sub tokenize_batch
+{
+ my(@text_list) = @_;
+ my(@tokenized_list) = ();
+ foreach (@text_list)
+ {
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+ {
+ #don't try to tokenize XML/HTML tag lines
+ push(@tokenized_list, $_);
+ }
+ else
+ {
+ push(@tokenized_list, &tokenize($_));
+ }
+ }
+ return \@tokenized_list;
+}
+
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string
+sub tokenize
+{
+ my($text) = @_;
+
+ #clean some stuff so you don't get &amp; -> &amp;amp;
+ #news-commentary stuff
+
+ $text =~ s/\&#45;/ /g;
+ $text =~ s/\&45;/ /g;
+ $text =~ s/\&#160;/ /g;
+ $text =~ s/\&gt;/\>/g;
+ $text =~ s/\&lt;/\</g;
+ $text =~ s/ampquot;/\"/g;
+ $text =~ s/ampquot/\"/g;
+ $text =~ s/\&quot;/\"/g;
+ $text =~ s/\&amp;/\&/g;
+ $text =~ s/\&nbsp;/ /g;
+ $text =~ s/\&#91;/\[/g; # syntax non-terminal
+ $text =~ s/\&#93;/\]/g; # syntax non-terminal
+ $text =~ s/\&bar;/\|/g; # factor separator (legacy)
+ $text =~ s/\&#124;/\|/g; # factor separator
+ $text =~ s/(\.){4,}/ /g; #remove junk like ........
+ $text =~ s/--/ -- /g;
+
+ chomp($text);
+ $text = " $text ";
+
+ # remove ASCII junk
+ $text =~ s/\s+/ /g;
+ $text =~ s/[\000-\037]//g;
+
+ # separate out all "other" special characters
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+ # aggressive hyphen splitting
+ if ($AGGRESSIVE)
+ {
+ $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+ }
+
+ #multi-dots stay together
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+ while($text =~ /DOTMULTI\./)
+ {
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+ }
+
+ # separate out "," except if within numbers (5,300)
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ # separate , pre and post number
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+ # turn `into '
+ $text =~ s/\`/\'/g;
+
+ #turn '' into "
+ $text =~ s/\'\'/ \" /g;
+
+ if ($language eq "en")
+ {
+ #split contractions right
+ # $text =~ s/ [']([\p{IsAlpha}])/ '$1/g; #MARIA: is pretokenized for parsing vb'll -> vb 'll
+ $text =~ s/([Dd])'ye/$1o you/g;
+ $text =~ s/([Dd])'you/$1o you/g;
+ $text =~ s/'Tis/It is/g;
+ $text =~ s/'tis/it is/g;
+ $text =~ s/'Twas/It was/g;
+ $text =~ s/'twas/it is/g;
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])['][ ]([sSmMdDtT]\s)/$1 '$2/g; # Commissioner' s
+ $text =~ s/([\p{IsAlpha}])['][ ](ll|ve) /$1 '$2 /g; # I' ve I' ll
+ $text =~ s/ ['] ([sSmMdDtT]\s)/ '$1/g; # Maria 's -> Maria ' s -> Maria 's
+ $text =~ s/ ['] (ll|ve) / '$1 /g; # I 'll I 've
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+ #$text =~ s/ ['] ([\p{IsAlpha}])/ '$1/g; # I 'll 1999 's
+ $text =~ s/([\p{IsAlpha}])n [']t/$1 n't/g; #don't -> do n't (don't first splits into don 't)
+ $text =~ s/([\p{IsAlpha}])n ['] t/$1 n't/g;
+ $text =~ s/([\p{IsAlpha}])n [']t/$1 n't/g;
+ $text =~ s/([\p{IsAlpha}])N [']T/$1 N'T/g;
+ #special case for "1990's"
+ $text =~ s/([\p{IsN}])[']s/$1 's/g;
+ $text =~ s/([\p{IsN}]) [']s/$1 's/g;
+ $text =~ s/([\p{IsN}]) ['] s/$1 's/g;
+
+
+
+ #other english contractions -> from PTB tokenizer.sed
+ $text =~ s/([Cc])annot/$1an not/g;
+ $text =~ s/([Gg])imme/$1im me/g;
+ $text =~ s/([Gg])onna/$1on na/g;
+ $text =~ s/([Gg])otta/$1ot ta/g;
+ $text =~ s/([Ll])emme/$1em me/g;
+ $text =~ s/([Ww])anna/$1an na/g;
+ $text =~ s/([Dd]) 'ye/$1' ye/g;
+
+ }
+ elsif (($language eq "fr") or ($language eq "it"))
+ {
+ #split contractions left
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+ }
+ else
+ {
+ $text =~ s/\'/ \' /g;
+ }
+
+ #word token method
+ my @words = split(/\s/,$text);
+ $text = "";
+ for (my $i=0;$i<(scalar(@words));$i++)
+ {
+ my $word = $words[$i];
+ if ( $word =~ /^(\S+)\.$/)
+ {
+ my $pre = $1;
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+ {
+ #no change
+ }
+ elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+ {
+ #no change
+ }
+ else
+ {
+ $word = $pre." .";
+ }
+ }
+ $text .= $word." ";
+ }
+
+ # clean up extraneous spaces
+ $text =~ s/ +/ /g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ #restore multi-dots
+ while($text =~ /DOTDOTMULTI/)
+ {
+ $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+ }
+ $text =~ s/DOTMULTI/./g;
+
+ #escape special chars
+ $text =~ s/\&/\&amp;/g; # escape escape
+ $text =~ s/\|/\&#124;/g; # factor separator
+ $text =~ s/\</\&lt;/g; # xml
+ $text =~ s/\>/\&gt;/g; # xml
+ $text =~ s/\'/\&apos;/g; # xml
+ $text =~ s/\"/\&quot;/g; # xml
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
+
+ #ensure final line break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
+}
+
+sub load_prefixes
+{
+ my ($language, $PREFIX_REF) = @_;
+
+ my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+ #default back to English if we don't have a language-specific prefix file
+ if (!(-e $prefixfile))
+ {
+ $prefixfile = "$mydir/nonbreaking_prefix.en";
+ print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+ die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+ }
+
+ if (-e "$prefixfile")
+ {
+ open(PREFIX, "<:utf8", "$prefixfile");
+ while (<PREFIX>)
+ {
+ my $item = $_;
+ chomp($item);
+ if (($item) && (substr($item,0,1) ne "#"))
+ {
+ if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+ {
+ $PREFIX_REF->{$1} = 2;
+ }
+ else
+ {
+ $PREFIX_REF->{$item} = 1;
+ }
+ }
+ }
+ close(PREFIX);
+ }
+}
+
diff --git a/scripts/training/bilingual-lm/README b/scripts/training/bilingual-lm/README
new file mode 100644
index 000000000..bf0666243
--- /dev/null
+++ b/scripts/training/bilingual-lm/README
@@ -0,0 +1,9 @@
+Example usage:
+#create training and test corpus
+/home/abmayne/code/deepathon/nnjm/extract_training.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus corpus/europarl.clean.10k --target-language cs --source-language en --align corpus/europarl.clean.10k.align
+/home/abmayne/code/deepathon/nnjm/extract_test.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus corpus/europarl.test.10k --target-language cs --source-language en --align corpus/europarl.test.10k.align
+
+#Train and test different language models with basic nplm training
+/home/abmayne/code/deepathon/nnjm/train_nplm.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus europarl.clean.10k --minibatch-size 128 --epochs 40 --output-model europarl.10k.bbn --nplm-home /home/abmayne/code/deepathon/nplm_one_layer --hidden 0 --threads 1 --output-model europarl.10k.1layer
+/home/abmayne/code/deepathon/nnjm/test_nplm.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus europarl.test.10k --train-corpus europarl.10k.1layer --nplm-home /home/abmayne/code/deepathon/nplm_one_layer --threads 1
+
diff --git a/scripts/training/bilingual-lm/averageNullEmbedding_baseline.py b/scripts/training/bilingual-lm/averageNullEmbedding_baseline.py
new file mode 100755
index 000000000..8fe616b46
--- /dev/null
+++ b/scripts/training/bilingual-lm/averageNullEmbedding_baseline.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python2
+import sys
+import numpy
+import optparse
+#sys.path.append('/data/tools/nplm/python')
+
+parser = optparse.OptionParser("%prog [options]")
+parser.add_option("-p", "--nplm-python-path", type="string", dest="nplm_python_path")
+parser.add_option("-i", "--input-model", type="string", dest="input_model")
+parser.add_option("-o", "--output-model", type="string", dest="output_model")
+parser.add_option("-n", "--null-token-index", type="int", dest="null_idx")
+parser.add_option("-t", "--training-ngrams", type="string", dest="training_ngrams")
+
+
+parser.set_defaults(
+ nplm_python_path = '/mnt/gna0/rsennrich/tools/nplm/python',
+ null_idx = 1
+)
+options,_ = parser.parse_args(sys.argv)
+
+sys.path.append(options.nplm_python_path)
+import nplm
+from collections import defaultdict
+
+def load_model(model_file):
+ return nplm.NeuralLM.from_file(model_file)
+
+def get_weights(path, length):
+ d = [0]*length
+ for line in open(path):
+ last_context = int(line.split()[-2])
+ d[last_context] += 1
+ return d
+
+if __name__ == "__main__":
+
+ a = load_model(options.input_model)
+ print 'before:'
+ print a.input_embeddings[options.null_idx]
+ weights = numpy.array(get_weights(options.training_ngrams, len(a.input_embeddings)))
+ a.input_embeddings[options.null_idx] = numpy.average(numpy.array(a.input_embeddings), weights=weights, axis=0)
+ print 'after:'
+ print a.input_embeddings[options.null_idx]
+ a.to_file(open(options.output_model,'w'))
diff --git a/scripts/training/bilingual-lm/extract.py b/scripts/training/bilingual-lm/extract.py
new file mode 100755
index 000000000..59c202995
--- /dev/null
+++ b/scripts/training/bilingual-lm/extract.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+from collections import Counter
+import heapq
+import logging
+import optparse
+import sys
+
+LOG = logging.getLogger(__name__)
+
+BOS = "<s>"
+EOS = "</s>"
+UNK = "<unk>"
+
+def replace_tags(tokens,tags,vocab):
+ for i,t in enumerate(tokens):
+ if not t in vocab:
+ if i < len(tags):
+ tokens[i] = tags[i]
+ else:
+ print "Error: missing tags for index i:", i
+ print ' '.join(tokens)
+ print ' '.join(tags)
+ tokens[i] = UNK
+
+def replace_unks(tokens,vocab):
+ for i,t in enumerate(tokens):
+ if not t in vocab:
+ tokens[i] = UNK
+
+
+def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang, m, n, ofh):
+ """
+ m - source context
+ n - target context
+
+ returns set of tags used
+ """
+ tags = Counter()
+ sfh = open(corpus_stem + "." + slang)
+ tfh = open(corpus_stem + "." + tlang)
+ afh = open(align_file)
+ fhs = [sfh,tfh,afh]
+ if tagged_stem:
+ fhs.append(open(tagged_stem + "." + slang))
+ fhs.append(open(tagged_stem + "." + tlang))
+
+ count = 0
+ ngrams = 0
+ LOG.info("Extracting ngrams")
+ for lines in zip(*fhs):
+ stokens = lines[0][:-1].split()
+ ttokens = lines[1][:-1].split()
+ stokens.append(EOS)
+ ttokens.append(EOS)
+ if tagged_stem:
+ stags = lines[3][:-1].split()
+ ttags = lines[4][:-1].split()
+ stags.append(EOS)
+ ttags.append(EOS)
+ tags.update(stags)
+ tags.update(ttags)
+ replace_tags(stokens,stags,svocab)
+ replace_tags(ttokens,ttags,tvocab)
+ else:
+ replace_unks(stokens,svocab)
+ replace_unks(ttokens,tvocab)
+ # list aligns for each target
+ # Note: align specifies source -> target
+ target_aligns = [[] for t in range(len(ttokens))]
+ for atoken in lines[2][:-1].split():
+ spos,tpos = atoken.split("-")
+ spos,tpos = int(spos), int(tpos)
+ target_aligns[tpos].append(spos)
+ #EOS alignment
+ target_aligns[-1] = [len(stokens)-1]
+
+ for tpos,spos_list in enumerate(target_aligns):
+ # Affiliation heuristics - see Devlin et al. p1371
+ if not spos_list:
+ #tpos has no alignment, look right, then left, then right-right, then left-left etc
+ rpos = tpos+1
+ lpos = tpos-1
+ while rpos < len(ttokens) or lpos >= 0:
+ if rpos < len(ttokens) and target_aligns[rpos]:
+ spos_list = target_aligns[rpos]
+ break
+ if lpos >= 0 and target_aligns[lpos]:
+ spos_list = target_aligns[lpos]
+ break
+ rpos += 1
+ lpos -= 1
+
+ if not spos_list:
+ raise Exception("No alignments in sentence \nSRC: " + lines[0][:-1] + "\nTGT: " + lines[1][:-1])
+ midpos = (len(spos_list)-1) / 2
+ spos = sorted(spos_list)[midpos]
+
+
+ # source-context, target-context, predicted word
+ for i in range(max(0,m-spos)):
+ print>>ofh, BOS,
+ #print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
+ print>>ofh, " ".join([s for s in stokens[max(0,spos-m):spos+m+1]]),
+ for i in range(max(0,spos+m+1-len(stokens))):
+ print>>ofh, EOS,
+ for i in range(max(0,n-(tpos+1))):
+ print>>ofh, BOS,
+ print>>ofh, " ".join([t for t in ttokens[max(0,tpos+1-n):tpos+1]]),
+ print>>ofh
+ ngrams += 1
+
+
+ count += 1
+ if count % 1000 == 0: sys.stderr.write(".")
+ if count % 50000 == 0: sys.stderr.write(" [%d]\n" % count)
+ ofh.close()
+ sys.stderr.write("\n")
+ LOG.info("Extracted %d ngrams" % ngrams)
+ return tags
+
+
diff --git a/scripts/training/bilingual-lm/extract_test.py b/scripts/training/bilingual-lm/extract_test.py
new file mode 100755
index 000000000..c8325e511
--- /dev/null
+++ b/scripts/training/bilingual-lm/extract_test.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+#
+# Create a test corpus, using a previously pruned vocabulary.
+#
+
+import logging
+import optparse
+import os
+import os.path
+import sys
+
+import extract
+
+def read_vocab(filename):
+ vocab = set()
+ for line in open(filename):
+ vocab.add(line[:-1])
+ return vocab
+
+def main():
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-e", "--target-language", type="string", dest="target_language")
+ parser.add_option("-f", "--source-language", type="string", dest="source_language")
+ parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
+ parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
+ parser.add_option("-a", "--align", type="string", dest="align_file")
+ parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
+
+
+ parser.set_defaults(
+ target_language = "en",
+ source_language = "de",
+ corpus_stem = "test",
+ align_file = "test.align",
+ working_dir = "working",
+ )
+ options,args = parser.parse_args(sys.argv)
+ if not os.path.exists(options.working_dir):
+ LOG.error("Working directory '%s' not found" % working_dir)
+ sys.exit(1)
+
+ m,n = None,None
+ for line in open(options.working_dir + "/info"):
+ name,value = line[:-1].split()
+ if name == "m": m = int(value)
+ if name == "n": n = int(value)
+ if m == None or n == None:
+ LOG.error("info file is incomplete")
+ sys.exit(1)
+
+ svocab = read_vocab(options.working_dir + "/vocab.source")
+ tvocab = read_vocab(options.working_dir + "/vocab.target")
+
+ file_stem = os.path.basename(options.corpus_stem)
+ ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
+ extract.get_ngrams(options.corpus_stem,
+ options.align_file,
+ options.tagged_stem,
+ svocab,
+ tvocab,
+ options.source_language,
+ options.target_language,
+ m,
+ n,
+ ofh)
+
+
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py
new file mode 100755
index 000000000..af272786c
--- /dev/null
+++ b/scripts/training/bilingual-lm/extract_training.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+
+from collections import Counter
+import logging
+import optparse
+import os
+import os.path
+import sys
+
+import extract
+
+LOG = logging.getLogger(__name__)
+
+def get_pruned_vocab(corpus,prune):
+ counts = Counter()
+ LOG.info("Reading vocabulary from %s" % corpus)
+ lines = 0
+ for line in open(corpus):
+ for token in line[:-1].split():
+ counts[token] += 1
+ lines += 1
+ if lines % 1000 == 0: sys.stderr.write(".")
+ if lines % 50000 == 0: sys.stderr.write(" [%d]\n" % lines)
+ sys.stderr.write("\n")
+ counts[extract.BOS] += lines
+ counts[extract.EOS] += lines
+ LOG.info("Vocabulary size: %d" % len(counts))
+ if prune:
+ return Counter(dict(counts.most_common(prune)))
+ else:
+ return counts
+
+def save_vocab(directory, filename, vocab):
+ fh = open(directory + "/" + filename, "w")
+ for word in vocab:
+ print>>fh, word
+
+def main():
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-e", "--target-language", type="string", dest="target_language")
+ parser.add_option("-f", "--source-language", type="string", dest="source_language")
+ parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
+ parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
+ parser.add_option("-a", "--align", type="string", dest="align_file")
+ parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
+ parser.add_option("-n", "--target-context", type="int", dest="n")
+ parser.add_option("-m", "--source-context", type="int", dest="m")
+ parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
+ parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")
+
+
+ parser.set_defaults(
+ target_language = "en",
+ source_language = "de",
+ corpus_stem = "train.10k",
+ align_file = "train.10k.align",
+ n = 5,
+ m = 4,
+ working_dir = "working",
+ sprune=16000,
+ tprune=16000
+ )
+ options,args = parser.parse_args(sys.argv)
+
+ if not os.path.exists(options.working_dir):
+ os.makedirs(options.working_dir)
+ else:
+ LOG.warn("Directory %s already exists, re-using" % options.working_dir)
+
+ info_file = options.working_dir + "/info"
+ if os.path.exists(info_file):
+ for line in open(info_file):
+ name,value = line[:-1].split()
+ if name == "n" and int(value) != options.n or \
+ name == "m" and int(value) != options.m:
+ LOG.error("info file exists, but parameters do not match. Delete working directory and rerun")
+ sys.exit(1)
+ else:
+ ifh = open(info_file,"w")
+ print>>ifh,"m",options.m
+ print>>ifh,"n",options.n
+ ifh.close()
+
+ scorpus = options.corpus_stem + "." + options.source_language
+ tcorpus = options.corpus_stem + "." + options.target_language
+
+ tvocab,svocab = None,None
+ # Extract vocabulary, and prune, if required
+ svocab = get_pruned_vocab(scorpus,options.sprune)
+ tvocab = get_pruned_vocab(tcorpus,options.tprune)
+
+
+ file_stem = os.path.basename(options.corpus_stem)
+ ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
+ ofh = open(ngram_file, "w")
+
+ tags = extract.get_ngrams(options.corpus_stem,
+ options.align_file,
+ options.tagged_stem,
+ svocab,
+ tvocab,
+ options.source_language,
+ options.target_language,
+ options.m,
+ options.n,
+ ofh)
+
+ # Save vocabularies
+ del svocab["<null>"]
+ del tvocab["<null>"]
+ del svocab["<unk>"]
+ del tvocab["<unk>"]
+ svocab_list = [item[0] for item in svocab.most_common()]
+ tvocab_list = [item[0] for item in tvocab.most_common()]
+
+ # UNK is always the first vocabulary element. Make sure
+ # it appears in position 0
+ # We need to use <null> token in the chart decoder in order
+ # to correctly estimate the probabilities of incomplete subphrases
+ # that are not sentence initial.
+
+ tvocab_list.insert(0, "<null>")
+ tvocab_list.insert(0, "<unk>")
+ svocab_list.insert(0, "<unk>")
+
+ #Get tags:
+ tag_list = [item[0] for item in tags.most_common()]
+ svocab_list = svocab_list + tag_list
+ tvocab_list = tvocab_list + tag_list
+
+ save_vocab(options.working_dir, "vocab.source", svocab_list)
+ save_vocab(options.working_dir, "vocab.target", tvocab_list)
+
+ #Create vocab dictionaries that map word to ID
+ tvocab_idmap = {}
+ for i in range(len(tvocab_list)):
+ tvocab_idmap[tvocab_list[i]] = i
+
+ svocab_idmap = {}
+ for i in range(len(svocab_list)):
+ svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)
+
+ numberized_file = options.working_dir + "/" + file_stem + ".numberized"
+ ngrams_file_handle = open(ngram_file, 'r')
+ numberized_file_handle = open(numberized_file, 'w')
+
+ #Numberize the file
+ for line in ngrams_file_handle:
+ line = line.split()
+ source_words = line[:(2*options.m + 1)]
+ target_words = line[-options.n:]
+
+ numberized_line = ""
+ for item in source_words:
+ numberized_line = numberized_line + str(svocab_idmap[item]) + " "
+
+ for item in target_words:
+ numberized_line = numberized_line + str(tvocab_idmap[item]) + " "
+
+ #Write to file replacing the last space with new line
+ numberized_file_handle.write(numberized_line[:-1] + "\n")
+ numberized_file_handle.close()
+ ngrams_file_handle.close()
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/training/bilingual-lm/reduce_ngrams.py b/scripts/training/bilingual-lm/reduce_ngrams.py
new file mode 100755
index 000000000..65795a10c
--- /dev/null
+++ b/scripts/training/bilingual-lm/reduce_ngrams.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+"""Reduces an ngrams file for training nplm to a smaller version of it with less ngrams"""
+from sys import argv
+
+if len(argv) != 5:
+ print("Wrong number of args, got: " + str(len(argv) - 1) + " expected 4.")
+ print("Usage: reduce_ngrams.py INFILE OUTFILE START_IDX NGRAMS")
+ exit()
+
+INFILE = open(argv[1], 'r')
+OUTFILE = open(argv[2], 'w')
+START_IDX = int(argv[3])
+NGRAMS = int(argv[4])
+
+for line in INFILE:
+ line = line.split()
+ line = line[START_IDX:START_IDX+NGRAMS]
+ linetowrite = ""
+ for token in line:
+ linetowrite = linetowrite + token + " "
+ #Strip final empty space and add newline
+ linetowrite = linetowrite[:-1]
+ linetowrite = linetowrite + '\n'
+ OUTFILE.write(linetowrite)
+
+INFILE.close()
+OUTFILE.close()
diff --git a/scripts/training/bilingual-lm/tag.sh b/scripts/training/bilingual-lm/tag.sh
new file mode 100755
index 000000000..7a8e1dc70
--- /dev/null
+++ b/scripts/training/bilingual-lm/tag.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+WRAP_DIR=~/moses.new/scripts/training/wrappers/
+
+
+tagger=$WRAP_DIR/make-factor-en-pos.mxpost.perl
+lang=en
+for stem in test train.10k train.100k; do
+ $tagger -mxpost /home/pkoehn/statmt/project/mxpost $stem.$lang $stem.tagged.$lang /tmp
+done
+
+tagger=$WRAP_DIR/make-factor-de-pos.perl
+lang=de
+for stem in test train.10k train.100k; do
+ $tagger $stem.$lang $stem.tagged.$lang /tmp
+done
+
diff --git a/scripts/training/bilingual-lm/test_nplm.py b/scripts/training/bilingual-lm/test_nplm.py
new file mode 100755
index 000000000..51b8cebda
--- /dev/null
+++ b/scripts/training/bilingual-lm/test_nplm.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import logging
+import optparse
+import subprocess
+import sys
+
+
+def main():
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-w", "--working-dir", dest="working_dir")
+ parser.add_option("-c", "--corpus", dest="corpus_stem")
+ parser.add_option("-r", "--train-corpus", dest="train_stem")
+ parser.add_option("-l", "--nplm-home", dest="nplm_home")
+ parser.add_option("-e", "--epoch", dest="epoch", type="int")
+ parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
+ parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
+ parser.add_option("-t", "--threads", dest="threads", type="int")
+
+ parser.set_defaults(
+ working_dir = "working"
+ ,corpus_stem = "test"
+ ,train_stem = "train.10k"
+ ,nplm_home = "/home/bhaddow/tools/nplm"
+ ,epoch=10
+ ,ngram_size = 14
+ ,minibatch_size=1000
+ ,threads=8
+ )
+
+ options,args = parser.parse_args(sys.argv)
+
+ model_prefix = options.working_dir + "/" + options.train_stem + ".model.nplm"
+ model_file = model_prefix + "." + str(options.epoch)
+ test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
+ prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
+ vocab_file = options.working_dir + "/vocab"
+
+ #TODO: Get ngram size from info file.
+ prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", test_file, "--ngram_size",
+ str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file]
+ ret = subprocess.call(prep_args)
+ if ret: raise Exception("Preparation failed")
+
+ test_args = [options.nplm_home + "/src/testNeuralNetwork", "--test_file", prep_file, "--model_file",
+ model_file , "--minibatch_size", str(options.minibatch_size), "--num_threads", str(options.threads)]
+ ret = subprocess.call(test_args)
+ if ret: raise Exception("Testing failed")
+
+#$ROOT/src/prepareNeuralLM --train_text $TESTFILE1 --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
+
+#$ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
+
+if __name__ == "__main__":
+ main()
+
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py
new file mode 100755
index 000000000..1af6cf325
--- /dev/null
+++ b/scripts/training/bilingual-lm/train_nplm.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+import logging
+import optparse
+import subprocess
+import sys
+import os
+
+def main():
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+ parser = optparse.OptionParser("%prog [options]")
+ parser.add_option("-w", "--working-dir", dest="working_dir")
+ parser.add_option("-c", "--corpus", dest="corpus_stem")
+ parser.add_option("-l", "--nplm-home", dest="nplm_home")
+ parser.add_option("-e", "--epochs", dest="epochs", type="int")
+ parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
+ parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
+ parser.add_option("-s", "--noise", dest="noise", type="int")
+ parser.add_option("-d", "--hidden", dest="hidden", type="int")
+ parser.add_option("-i", "--input-embedding", dest="input_embedding", type="int")
+ parser.add_option("-o", "--output-embedding", dest="output_embedding", type="int")
+ parser.add_option("-t", "--threads", dest="threads", type="int")
+ parser.add_option("-m", "--output-model", dest="output_model")
+ parser.add_option("-r", "--output-dir", dest="output_dir")
+ parser.add_option("-f", "--config-options-file", dest="config_options_file")
+ parser.add_option("-g", "--log-file", dest="log_file")
+ parser.add_option("-v", "--validation-ngrams", dest="validation_file")
+ parser.add_option("-a", "--activation-function", dest="activation_fn")
+
+ parser.set_defaults(
+ working_dir = "working"
+ ,corpus_stem = "train.10k"
+ ,nplm_home = "/home/bhaddow/tools/nplm"
+ ,epochs = 10
+ ,ngram_size = 14
+ ,minibatch_size=1000
+ ,noise=100
+ ,hidden=750
+ ,input_embedding=150
+ ,output_embedding=150
+ ,threads=1
+ ,output_model = "train.10k"
+ ,output_dir = None
+ ,config_options_file = "config"
+ ,log_file = "log"
+ ,validation_file = None
+ ,activation_fn = "rectifier"
+ )
+
+ options,args = parser.parse_args(sys.argv)
+
+ # Set up validation command variable to use with validation set.
+ validations_command = []
+ if options.validation_file is not None:
+ validations_command =["--validation_file", (options.validation_file + ".numberized")]
+
+
+ # In order to allow for different models to be trained after the same
+ # preparation step, we should provide an option for multiple output directories
+ # If we have not set output_dir, set it to the same thing as the working dir
+
+ if options.output_dir is None:
+ options.output_dir = options.working_dir
+ else:
+ # Create output dir if necessary
+ if not os.path.exists(options.output_dir):
+ os.makedirs(options.output_dir)
+
+ config_file = options.output_dir + "/" + options.config_options_file + '-' + options.output_model
+ log_file = options.output_dir + "/" + options.log_file + '-' + options.output_model
+ log_file_write = open(log_file, 'w')
+ config_file_write = open(config_file, 'w')
+
+ config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
+
+ in_file = options.working_dir + "/" + options.corpus_stem + ".numberized"
+
+
+ model_prefix = options.output_dir + "/" + options.output_model + ".model.nplm"
+ train_args = [options.nplm_home + "/src/trainNeuralNetwork", "--train_file", in_file, "--num_epochs", str(options.epochs),
+ "--model_prefix",
+ model_prefix, "--learning_rate", "1", "--minibatch_size", str(options.minibatch_size),
+ "--num_noise_samples", str(options.noise), "--num_hidden", str(options.hidden), "--input_embedding_dimension",
+ str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads",
+ str(options.threads), "--activation_function", options.activation_fn] + validations_command
+ print "Train model command: "
+ print ', '.join(train_args)
+
+ config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
+ config_file_write.close()
+
+ log_file_write.write("Training output:\n")
+ ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
+ if ret: raise Exception("Training failed")
+
+ log_file_write.close()
+
+if __name__ == "__main__":
+ main()
+
+
+
+
+#EPOCHS=10
+#NGRAM_SIZE=14
+#MINIBATCH_SIZE=1000
+#NOISE=100
+#HIDDEN=750
+#INPUT_EMBEDDING=150
+#OUTPUT_EMBEDDING=150
+#THREADS=8
+#
+
+#$ROOT/src/prepareNeuralLM --train_text $INFILE --ngram_size $NGRAM_SIZE --ngramize 0 --words_file $VOCAB --train_file $WORKDIR/train.ngrams || exit 1
+
+#$ROOT/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams \
+# --num_epochs $EPOCHS --input_words_file $VOCAB --output_words_file $VOCAB --model_prefix $WORKDIR/$PREFIX \
+# --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
+# --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
+
+
diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl
new file mode 100755
index 000000000..00a56977e
--- /dev/null
+++ b/scripts/training/build-mmsapt.perl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Getopt::Long "GetOptions";
+use FindBin qw($RealBin);
+
+my ($DIR,$F,$E,$ALIGNMENT,$CORPUS,$SETTINGS);
+die("ERROR: syntax is --alignment FILE --corpus FILESTEM --f EXT --e EXT --DIR OUTDIR --settings STRING")
+ unless &GetOptions('DIR=s' => \$DIR,
+ 'f=s' => \$F,
+ 'e=s' => \$E,
+ 'corpus=s' => \$CORPUS,
+ 'alignment=s' => \$ALIGNMENT,
+ 'settings=s' => \$SETTINGS)
+ && defined($DIR) && defined($F) && defined($E) && defined($CORPUS) && defined($ALIGNMENT)
+ && -e $ALIGNMENT && -e "$CORPUS.$F" && -e "$CORPUS.$E";
+
+`mkdir $DIR`;
+`$RealBin/../../bin/mtt-build < $CORPUS.$F -i -o $DIR/$F`;
+`$RealBin/../../bin/mtt-build < $CORPUS.$E -i -o $DIR/$E`;
+`$RealBin/../../bin/symal2mam < $ALIGNMENT $DIR/$F-$E.mam`;
+`$RealBin/../../bin/mmlex-build $DIR/ $F $E -o $DIR/$F-$E.lex -c $DIR/$F-$E.cooc`;
+
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index fdb1ad53f..895b64b96 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -113,6 +113,7 @@ while(my $line = <INI>) {
|| $line =~ /PhraseDictionaryBinary /
|| $line =~ /PhraseDictionaryOnDisk /
|| $line =~ /PhraseDictionarySCFG /
+ || $line =~ /RuleTable /
) {
print STDERR "pt:$line\n";
@@ -143,7 +144,7 @@ while(my $line = <INI>) {
}
} #for (my $i = 1; $i < scalar(@toks); ++$i) {
- if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG") || $file =~ /glue-grammar/ || $skip) {
+ if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG" && $phrase_table_impl ne "RuleTable") || $file =~ /glue-grammar/ || $skip) {
# Only Memory ("0") and NewFormat ("6") can be filtered.
print INI_OUT "$line\n";
next;
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 12327f7b2..25d12a8ab 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -127,8 +127,8 @@ my $___NOCASE = 0;
# Use "--nonorm" to non normalize translation before computing scores
my $___NONORM = 0;
-# set 0 if input type is text, set 1 if input type is confusion network
-my $___INPUTTYPE = 0;
+# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
+my $___INPUTTYPE;
my $mertdir = undef; # path to new mert directory
@@ -160,6 +160,12 @@ my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loa
# and so on
my $maximum_iterations = 25;
+# Simulated post-editing
+my $___MOSES_SIM_PE = "$SCRIPTS_ROOTDIR/generic/moses_sim_pe.py";
+my $___DEV_SYMAL = undef;
+my $dev_symal_abs = undef;
+my $working_dir_abs = undef;
+
use Getopt::Long;
GetOptions(
"working-dir=s" => \$___WORKING_DIR,
@@ -213,7 +219,8 @@ GetOptions(
"batch-mira-args=s" => \$batch_mira_args,
"promix-training=s" => \$__PROMIX_TRAINING,
"promix-table=s" => \@__PROMIX_TABLES,
- "threads=i" => \$__THREADS
+ "threads=i" => \$__THREADS,
+ "spe-symal=s" => \$___DEV_SYMAL
) or exit(1);
# the 4 required parameters can be supplied on the command line directly
@@ -308,6 +315,8 @@ Options:
--threads=NUMBER ... Use multi-threaded mert (must be compiled in).
--historic-interpolation ... Interpolate optimized weights with prior iterations' weight
(parameter sets factor [0;1] given to current weights)
+ --spe-symal=SYMAL ... Use simulated post-editing when decoding.
+ (SYMAL aligns input to refs)
";
exit 1;
}
@@ -377,15 +386,29 @@ if ($__PROMIX_TRAINING) {
die "To use promix training, need to specify a filter and binarisation command" unless $filtercmd =~ /Binarizer/;
}
-$mertargs = "" if !defined $mertargs;
+if (!defined $mertargs) {
+ if (defined $batch_mira_args) {
+ $mertargs = $batch_mira_args;
+ }
+ else {
+ $mertargs = "";
+ }
+}
my $scconfig = undef;
-if ($mertargs =~ /\-\-scconfig\s+(.+?)(\s|$)/) {
+if ($mertargs =~ /\-\-scconfig(?:\s+|=)(.+?)(\s|$)/) {
$scconfig = $1;
$scconfig =~ s/\,/ /g;
- $mertargs =~ s/\-\-scconfig\s+(.+?)(\s|$)//;
+ $mertargs =~ s/\-\-scconfig(?:\s+|=)(.+?)(\s|$)//;
+}
+
+my $sctype = "--sctype BLEU";
+if ($mertargs =~ /(\-\-sctype(?:\s+|=).+?)(\s|$)/) {
+ $sctype = $1;
+ $mertargs =~ s/(\-\-sctype(?:\s+|=)+.+?)(\s|$)//;
}
+
# handling reference lengh strategy
$scconfig .= &setup_reference_length_type();
@@ -398,8 +421,7 @@ $scconfig =~ s/\s+/,/g;
$scconfig = "--scconfig $scconfig" if ($scconfig);
-my $mert_extract_args = $mertargs;
-$mert_extract_args .= " $scconfig";
+my $mert_extract_args = "$sctype $scconfig";
$extractorargs = "" unless $extractorargs;
$mert_extract_args .= " $extractorargs";
@@ -410,7 +432,7 @@ $proargs = "" unless $proargs;
my $mert_mert_args = "$mertargs $mertmertargs";
$mert_mert_args =~ s/\-+(binary|b)\b//;
-$mert_mert_args .= " $scconfig";
+$mert_mert_args .= "$sctype $scconfig";
if ($___ACTIVATE_FEATURES) {
$mert_mert_args .= " -o \"$___ACTIVATE_FEATURES\"";
}
@@ -467,6 +489,12 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
}
+# Paths needed for simulated post-editing
+$working_dir_abs = ensure_full_path($___WORKING_DIR);
+if (defined $___DEV_SYMAL) {
+ $dev_symal_abs = ensure_full_path($___DEV_SYMAL);
+}
+
# as weights are normalized in the next steps (by cmert)
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;
@@ -863,8 +891,8 @@ while (1) {
$mira_settings .= "$batch_mira_args ";
}
- $mira_settings .= " --dense-init run$run.$weights_in_file";
- #$mira_settings .= " --dense-init run$run.dense";
+ #$mira_settings .= " --dense-init run$run.$weights_in_file";
+ $mira_settings .= " --dense-init run$run.dense";
if (-e "run$run.sparse-weights") {
$mira_settings .= " --sparse-init run$run.sparse-weights";
}
@@ -1098,7 +1126,7 @@ if($___RETURN_BEST_DEV) {
my $bestbleu=0;
my $evalout = "eval.out";
for (my $i = 1; $i < $run; $i++) {
- my $cmd = "$mert_eval_cmd --reference " . join(",", @references) . " -s BLEU --candidate run$i.out";
+ my $cmd = "$mert_eval_cmd --reference " . join(",", @references) . " $mert_extract_args --nbest run$i.best$___N_BEST_LIST_SIZE.out.gz";
$cmd .= " -l $__REMOVE_SEGMENTATION" if defined( $__PROMIX_TRAINING);
safesystem("$cmd 2> /dev/null 1> $evalout");
open my $fh, '<', $evalout or die "Can't read $evalout : $!";
@@ -1228,16 +1256,27 @@ sub run_decoder {
if (defined $___JOBS && $___JOBS > 0) {
die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
- $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
+ $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
+ $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
+ $decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
} else {
- my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE";
+ my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
if ($___HG_MIRA) {
safesystem("rm -rf $hypergraph_dir");
$nbest_list_cmd = "-output-search-graph-hypergraph true gz";
}
- $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
+ $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG";
+ $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
+ $decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F";
+ if (defined $___DEV_SYMAL) {
+ # If simulating post-editing, route command through moses_sim_pe.py
+ # Always use single (first) reference. Simulated post-editing undefined for multiple references.
+ $decoder_cmd = "$___MOSES_SIM_PE $decoder_cmd -ref $references[0] -symal $dev_symal_abs -tmp $working_dir_abs > run$run.out";
+ }
+ $decoder_cmd .= " > run$run.out";
}
+ print STDERR "Executing: $decoder_cmd \n";
safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n";
if (!$___HG_MIRA) {
@@ -1308,7 +1347,10 @@ sub get_featlist_from_moses {
print STDERR "Using cached features list: $featlistfn\n";
} else {
print STDERR "Asking moses for feature names and values from $___CONFIG\n";
- my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
+ my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
+ $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
+ $cmd .= " -show-weights > $featlistfn";
+ print STDERR "Executing: $cmd\n";
safesystem($cmd) or die "Failed to run moses with the config $configfn";
}
return get_featlist_from_file($featlistfn);
@@ -1605,7 +1647,7 @@ sub create_extractor_script() {
open my $out, '>', $script_path
or die "Couldn't open $script_path for writing: $!\n";
- print $out "#!/bin/bash\n";
+ print $out "#!/usr/bin/env bash\n";
print $out "cd $outdir\n";
print $out "$cmd\n";
close $out;
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 33f075311..6dc65b8f3 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -22,7 +22,7 @@ $SCRIPTS_ROOTDIR =~ s/\/training$//;
#$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_SORT_PARALLEL, $_CORPUS,
- $_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH,
+ $_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH, $_DISTORTION_LIMIT,
$_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_LEXICAL_COUNTS, $_VERBOSE, $_ALIGNMENT,
$_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS,
$_DIRECTION, $_ONLY_PRINT_GIZA, $_GIZA_EXTENSION, $_REORDERING,
@@ -32,12 +32,12 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
- $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
+ $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,$_S2T,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
- $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
- @_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
+ $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_LEXICAL_REORDERING_DEFAULT_SCORES,$_DO_STEPS,
+ @_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,$_MMSAPT,
@_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
$_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $_NUM_LATTICE_FEATURES, $IGNORE, $_FLEXIBILITY_SCORE, $_EXTRACT_COMMAND);
my $_BASELINE_CORPUS = "";
@@ -54,6 +54,7 @@ $_HELP = 1
'giza-e2f=s' => \$_GIZA_E2F,
'giza-f2e=s' => \$_GIZA_F2E,
'max-phrase-length=s' => \$_MAX_PHRASE_LENGTH,
+ 'distortion-limit=s' => \$_DISTORTION_LIMIT,
'lexical-file=s' => \$_LEXICAL_FILE,
'no-lexical-weighting' => \$_NO_LEXICAL_WEIGHTING,
'write-lexical-counts' => \$_LEXICAL_COUNTS,
@@ -104,12 +105,17 @@ $_HELP = 1
'generation-type=s' => \@_GENERATION_TYPE,
'continue' => \$_CONTINUE,
'hierarchical' => \$_HIERARCHICAL,
+ 's2t' => \$_S2T,
'glue-grammar' => \$_GLUE_GRAMMAR,
'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE,
'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE,
'unknown-word-soft-matches-file=s' => \$_UNKNOWN_WORD_SOFT_MATCHES_FILE, # give dummy label to unknown word, and allow soft matches to all other labels (with cost determined by sparse features)
'ghkm' => \$_GHKM,
'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
+ 'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
+ 'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
+ 'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
+ 'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
'pcfg' => \$_PCFG,
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
@@ -121,10 +127,12 @@ $_HELP = 1
'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
'config=s' => \$_CONFIG,
'osm-model=s' => \$_OSM,
- 'osm-setting=s' => \$_OSM_FACTORS,
- 'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT,
- 'transliteration-phrase-table=s' => \$_TRANSLITERATION_PHRASE_TABLE,
+ 'osm-setting=s' => \$_OSM_FACTORS,
+ 'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT,
+ 'transliteration-phrase-table=s' => \$_TRANSLITERATION_PHRASE_TABLE,
+ 'mmsapt=s' => \$_MMSAPT,
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
+ 'lexical-reordering-default-scores=s' => \$_LEXICAL_REORDERING_DEFAULT_SCORES,
'do-steps=s' => \$_DO_STEPS,
'memscore:s' => \$_MEMSCORE,
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
@@ -189,6 +197,7 @@ $_GIZA_F2E = File::Spec->rel2abs($_GIZA_F2E) if defined($_GIZA_F2E);
my $_SCORE_OPTIONS; # allow multiple switches
foreach (@_SCORE_OPTIONS) { $_SCORE_OPTIONS .= $_." "; }
chop($_SCORE_OPTIONS) if $_SCORE_OPTIONS;
+
my $_EXTRACT_OPTIONS; # allow multiple switches
foreach (@_EXTRACT_OPTIONS) { $_EXTRACT_OPTIONS .= $_." "; }
chop($_EXTRACT_OPTIONS) if $_EXTRACT_OPTIONS;
@@ -434,11 +443,14 @@ $___CONTINUE = $_CONTINUE if $_CONTINUE;
my $___MAX_PHRASE_LENGTH = "7";
$___MAX_PHRASE_LENGTH = "10" if $_HIERARCHICAL;
+$___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH;
+
+my $___DISTORTION_LIMIT = 6;
+$___DISTORTION_LIMIT = $_DISTORTION_LIMIT if $_DISTORTION_LIMIT;
my $___LEXICAL_WEIGHTING = 1;
my $___LEXICAL_COUNTS = 0;
my $___LEXICAL_FILE = $___MODEL_DIR."/lex";
-$___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH;
$___LEXICAL_WEIGHTING = 0 if $_NO_LEXICAL_WEIGHTING;
$___LEXICAL_COUNTS = 1 if $_LEXICAL_COUNTS;
$___LEXICAL_FILE = $_LEXICAL_FILE if $_LEXICAL_FILE;
@@ -1423,8 +1435,15 @@ sub extract_phrase {
$cmd .= " --PCFG" if $_PCFG;
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
- $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
- if (!defined($_GHKM)) {
+ if (defined($_GHKM))
+ {
+ $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
+ $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
+ $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
+ $cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
+ }
+ else
+ {
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
$cmd .= " --MaxSpan $max_length";
@@ -1548,12 +1567,19 @@ sub score_phrase_phrase_extract {
my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/);
my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/);
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
+ my $SOURCE_LABELS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabels/);
+ my $SOURCE_LABEL_COUNTS_LHS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelCountsLHS/);
+ my $SOURCE_LABEL_SET = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelSet/);
+ my $SPAN_LENGTH = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SpanLength/);
my $CORE_SCORE_OPTIONS = "";
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
$CORE_SCORE_OPTIONS .= " --NegLogProb" if $NEG_LOG_PROB;
$CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
$CORE_SCORE_OPTIONS .= " --Singleton" if $SINGLETON;
$CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM;
+ $CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
+ $CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
+ $CORE_SCORE_OPTIONS .= " --SourceLabelSet " if $SOURCE_LABEL_SET;
my $substep = 1;
my $isParent = 1;
@@ -1586,6 +1612,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --NoWordAlignment" if $_OMIT_WORD_ALIGNMENT;
$cmd .= " --KneserNey" if $KNESER_NEY;
$cmd .= " --GoodTuring" if $GOOD_TURING && $inverse eq "";
+ $cmd .= " --SpanLength" if $SPAN_LENGTH && $inverse eq "";
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
$cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
$cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
@@ -1593,6 +1620,9 @@ sub score_phrase_phrase_extract {
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
+ $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
+ $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
+ $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
@@ -1643,6 +1673,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
+ $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " | gzip -c > $ttable_file.gz";
@@ -1947,14 +1978,23 @@ sub create_ini {
$phrase_table_impl_name = "PhraseDictionaryOnDisk" if $phrase_table_impl==2;
$phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==6;
$phrase_table_impl_name = "PhraseDictionaryALSuffixArray" if $phrase_table_impl==10;
+ $phrase_table_impl_name = "PhraseDictionaryBitextSampling" if $phrase_table_impl==11;
+ $file .= "/" if $phrase_table_impl==11 && $file !~ /\/$/;
- #table limit
+ if ($_S2T) {
+ $phrase_table_impl_name = "RuleTable";
+ }
+
+ # table limit (maximum number of translation options per input phrase)
my $table_limit = 0;
if ($i == 0) {
$table_limit = 20;
}
+
# sum up...
- $feature_spec .= "$phrase_table_impl_name name=TranslationModel$i table-limit=$table_limit num-features=$basic_weight_count path=$file input-factor=$input_factor output-factor=$output_factor\n";
+ $feature_spec .= "$phrase_table_impl_name name=TranslationModel$i num-features=$basic_weight_count path=$file input-factor=$input_factor output-factor=$output_factor";
+ $feature_spec .= " L1=$___F L2=$___E ".$_MMSAPT if defined($_MMSAPT); # extra settings for memory mapped suffix array phrase table
+ $feature_spec .= "\n";
$weight_spec .= "TranslationModel$i=";
for(my $j=0;$j<$basic_weight_count;$j++) { $weight_spec .= " 0.2"; }
$weight_spec .= "\n";
@@ -1967,8 +2007,7 @@ sub create_ini {
exit 1 if $i < $stepsused{"T"}; # fatal to define less
}
- if ($_TRANSLITERATION_PHRASE_TABLE){
-
+ if ($_TRANSLITERATION_PHRASE_TABLE) {
$feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i table-limit=100 num-features=4 path=$_TRANSLITERATION_PHRASE_TABLE input-factor=0 output-factor=0\n";
$weight_spec .= "TranslationModel$i= 0.2 0.2 0.2 0.2\n";
$i++;
@@ -1977,7 +2016,11 @@ sub create_ini {
# glue grammar
if ($_GLUE_GRAMMAR) {
&full_path(\$___GLUE_GRAMMAR_FILE);
- $feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i num-features=1 path=$___GLUE_GRAMMAR_FILE input-factor=0 output-factor=0\n";
+ my $feature_name = "PhraseDictionaryMemory";
+ if ($_S2T) {
+ $feature_name = "RuleTable";
+ }
+ $feature_spec .= "$feature_name name=TranslationModel$i num-features=1 path=$___GLUE_GRAMMAR_FILE input-factor=0 output-factor=0\n";
$weight_spec .= "TranslationModel$i= 1.0\n";
}
@@ -2017,7 +2060,7 @@ sub create_ini {
$table_file .= ".";
$table_file .= $model->{"filename"};
$table_file .= ".gz";
- $feature_spec .= "LexicalReordering name=LexicalReordering$i num-features=".$model->{"numfeatures"}." type=".$model->{"config"}." input-factor=$input_factor output-factor=$output_factor path=$table_file\n";
+ $feature_spec .= "LexicalReordering name=LexicalReordering$i num-features=".$model->{"numfeatures"}." type=".$model->{"config"}." input-factor=$input_factor output-factor=$output_factor path=$table_file".(defined($_LEXICAL_REORDERING_DEFAULT_SCORES)?" default-scores=$_LEXICAL_REORDERING_DEFAULT_SCORES":"")."\n";
$weight_spec .= "LexicalReordering$i=";
for(my $j=0;$j<$model->{"numfeatures"};$j++) { $weight_spec .= " 0.3"; }
$weight_spec .= "\n";
@@ -2072,12 +2115,16 @@ sub create_ini {
my $path = `pwd`; chop($path);
$fn = $path."/".$fn;
}
- $type = 0 unless $type;
- my $type_name = "UnknownLM";
- $type_name = "SRILM" if $type == 0;
- $type_name = "IRSTLM" if $type == 1;
- $type_name = "KENLM lazyken=0" if $type == 8;
- $type_name = "KENLM lazyken=1" if $type == 9;
+ $type = "SRILM" unless defined $type; # default to SRILM if no type given
+
+ if ($type =~ /^\d+$/) {
+ # backwards compatibility if the type is given not as string but as a number
+ $type = "SRILM" if $type == 0;
+ $type = "IRSTLM" if $type == 1;
+ $type = "KENLM lazyken=0" if $type == 8;
+ $type = "KENLM lazyken=1" if $type == 9;
+ die "Unknown numeric LM type given: $type" if $type =~ /^\d+$/;
+ }
my $lm_oov_prob = 0.1;
@@ -2086,7 +2133,7 @@ sub create_ini {
$_LMODEL_OOV_FEATURE = "yes";
}
- $feature_spec .= "$type_name name=LM$i factor=$f path=$fn order=$o\n";
+ $feature_spec .= "$type name=LM$i factor=$f path=$fn order=$o\n";
$weight_spec .= "LM$i= 0.5".($_LMODEL_OOV_FEATURE?" $lm_oov_prob":"")."\n";
$i++;
}
@@ -2108,7 +2155,7 @@ sub create_ini {
}
# phrase-based model settings
else {
- print INI "[distortion-limit]\n6\n";
+ print INI "[distortion-limit]\n$___DISTORTION_LIMIT\n";
}
# only set the factor delimiter if it is non-standard
@@ -2143,6 +2190,7 @@ sub create_ini {
print INI "WordPenalty\n";
print INI "PhrasePenalty\n";
print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
+ print INI "SoftSourceSyntacticConstraintsFeature sourceLabelSetFile=$_GHKM_SOURCE_LABELS_FILE\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
print INI $feature_spec;
print INI "\n# dense weights for feature functions\n";
@@ -2150,6 +2198,7 @@ sub create_ini {
print INI "UnknownWordPenalty0= 1\n";
print INI "WordPenalty0= -1\n";
print INI "PhrasePenalty0= 0.2\n";
+ print INI "SoftSourceSyntacticConstraintsFeature0= 0.3 -0.3 -0.3\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
print INI $weight_spec;
close(INI);
}
diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
new file mode 100755
index 000000000..e447ee146
--- /dev/null
+++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl
@@ -0,0 +1,54 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+while(<STDIN>) {
+ if (/^\(\(\)\)/) {
+ print "\n"; # parse failures
+ next;
+ }
+
+ # prep
+ s/^\( /\(TOP /;
+
+ # escape words
+ s/\&/\&amp;/g; # escape escape
+ s/\|/\&bar;/g; # factor separator
+ s/\|/\&#124;/g; # factor separator
+ s/\</\&lt;/g; # xml
+ s/\>/\&gt;/g; # xml
+ s/\'\'/\&quot;/g;
+ s/``/\&quot;/g;
+ s/\'/\&apos;/g; # xml
+ s/\"/\&quot;/g; # xml
+ s/\[/\&#91;/g; # syntax non-terminal
+ s/\]/\&#93;/g; # syntax non-terminal
+
+
+ # escape parentheses that were part of the input text
+ s/(\(\S+ )\(\)/$1\&openingparenthesis;\)/g;
+ s/(\(\S+ )\)\)/$1\&closingparenthesis;\)/g;
+
+
+
+ # convert into tree
+ s/\((\S+) /<tree label=\"$1\"> /g;
+ s/\)/ <\/tree> /g;
+ s/\"\-LRB\-\"/\"LRB\"/g; # labels
+ s/\"\-RRB\-\"/\"RRB\"/g;
+ s/\-LRB\-/\(/g; # tokens
+ s/\-RRB\-/\)/g;
+ s/ +/ /g;
+ s/ $//g;
+
+ # de-escape parentheses that were part of the input text
+ s/\&openingparenthesis;/\(/g;
+ s/\&closingparenthesis;/\)/g;
+
+ s/tree label=\"\&quot;\"/tree label=\"QUOT\"/g;
+ #s/tree label=\"''\"/tree label=\"QUOT\"/g;
+ #s/tree label=\"``\"/tree label=\"QUOT\"/g;
+
+ # output, replace words with original
+ print $_;
+}
diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py
new file mode 100755
index 000000000..69ee4f737
--- /dev/null
+++ b/scripts/training/wrappers/conll2mosesxml.py
@@ -0,0 +1,183 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# takes a file in the CoNLL dependency format (from the CoNLL-X shared task on dependency parsing; http://ilk.uvt.nl/conll/#dataformat )
+# and produces Moses XML format. Note that the structure is built based on fields 9 and 10 (projective HEAD and RELATION),
+# which not all parsers produce.
+
+# usage: conll2mosesxml.py [--brackets] < input_file > output_file
+
+from __future__ import print_function, unicode_literals
+import sys
+import re
+import codecs
+from collections import namedtuple,defaultdict
+from lxml import etree as ET
+
+
+Word = namedtuple('Word', ['pos','word','lemma','tag','head','func', 'proj_head', 'proj_func'])
+
+def main(output_format='xml'):
+ sentence = []
+
+ for line in sys.stdin:
+
+ # process sentence
+ if line == "\n":
+ sentence.insert(0,[])
+ if is_projective(sentence):
+ write(sentence,output_format)
+ else:
+ sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n')
+ sys.stdout.write('\n')
+ sentence = []
+ continue
+
+ try:
+ pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = line.split()
+ except ValueError: # word may be unicode whitespace
+ pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = re.split(' *\t*',line.strip())
+
+ word = escape_special_chars(word)
+ lemma = escape_special_chars(lemma)
+
+ if proj_head == '_':
+ proj_head = head
+ proj_func = func
+
+ sentence.append(Word(int(pos), word, lemma, tag2,int(head), func, int(proj_head), proj_func))
+
+
+# this script performs the same escaping as escape-special-chars.perl in Moses.
+# most of it is done in function write(), but quotation marks need to be processed first
+def escape_special_chars(line):
+
+ line = line.replace('\'','&apos;') # xml
+ line = line.replace('"','&quot;') # xml
+ line = line.replace('[','&#91;') # syntax non-terminal
+ line = line.replace(']','&#93;') # syntax non-terminal
+
+ return line
+
+
+# make a check if structure is projective
+def is_projective(sentence):
+ dominates = defaultdict(set)
+ for i,w in enumerate(sentence):
+ dominates[i].add(i)
+ if not i:
+ continue
+ head = int(w.proj_head)
+ while head != 0:
+ if i in dominates[head]:
+ break
+ dominates[head].add(i)
+ head = int(sentence[head].proj_head)
+
+ for i in dominates:
+ dependents = dominates[i]
+ if max(dependents) - min(dependents) != len(dependents)-1:
+ sys.stderr.write("error: non-projective structure.\n")
+ return False
+ return True
+
+
+def write(sentence, output_format='xml'):
+
+ if output_format == 'xml':
+ tree = create_subtree(0,sentence)
+ out = ET.tostring(tree, encoding = 'UTF-8').decode('UTF-8')
+
+ if output_format == 'brackets':
+ out = create_brackets(0,sentence)
+
+ out = out.replace('|','&#124;') # factor separator
+
+ out = out.replace('&amp;apos;','&apos;') # lxml is buggy if input is escaped
+ out = out.replace('&amp;quot;','&quot;') # lxml is buggy if input is escaped
+ out = out.replace('&amp;#91;','&#91;') # lxml is buggy if input is escaped
+ out = out.replace('&amp;#93;','&#93;') # lxml is buggy if input is escaped
+
+ print(out)
+
+# write node in Moses XML format
+def create_subtree(position, sentence):
+
+ element = ET.Element('tree')
+
+ if position:
+ element.set('label', sentence[position].proj_func)
+ else:
+ element.set('label', 'sent')
+
+ for i in range(1,position):
+ if sentence[i].proj_head == position:
+ element.append(create_subtree(i, sentence))
+
+ if position:
+
+ if preterminals:
+ head = ET.Element('tree')
+ head.set('label', sentence[position].tag)
+ head.text = sentence[position].word
+ element.append(head)
+
+ else:
+ if len(element):
+ element[-1].tail = sentence[position].word
+ else:
+ element.text = sentence[position].word
+
+ for i in range(position, len(sentence)):
+ if i and sentence[i].proj_head == position:
+ element.append(create_subtree(i, sentence))
+
+ return element
+
+
+# write node in bracket format (Penn treebank style)
+def create_brackets(position, sentence):
+
+ if position:
+ element = "[ " + sentence[position].proj_func + ' '
+ else:
+ element = "[ sent "
+
+ for i in range(1,position):
+ if sentence[i].proj_head == position:
+ element += create_brackets(i, sentence)
+
+ if position:
+ word = sentence[position].word
+ tag = sentence[position].tag
+
+ if preterminals:
+ element += '[ ' + tag + ' ' + word + ' ] '
+ else:
+ element += word + ' ] '
+
+ for i in range(position, len(sentence)):
+ if i and sentence[i].proj_head == position:
+ element += create_brackets(i, sentence)
+
+ if preterminals or not position:
+ element += '] '
+
+ return element
+
+if __name__ == '__main__':
+ if sys.version_info < (3,0,0):
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+
+ if '--no_preterminals' in sys.argv:
+ preterminals = False
+ else:
+ preterminals = True
+
+ if '--brackets' in sys.argv:
+ main('brackets')
+ else:
+ main('xml')
diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
index 60f341de8..38e331737 100755
--- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
+++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
@@ -2,7 +2,7 @@
use strict;
-my ($cluster_file,$in,$out,$tmp) = @ARGV;
+my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV;
my $CLUSTER = &read_cluster_from_mkcls($cluster_file);
@@ -17,7 +17,10 @@ while(<IN>) {
s/ $//;
my $first = 1;
foreach my $word (split) {
- my $cluster = defined($$CLUSTER{$word}) ? $$CLUSTER{$word} : "<unk>";
+ if ($lowercase) {
+ $word = lc($word);
+ }
+ my $cluster = defined($$CLUSTER{$word}) ? $$CLUSTER{$word} : "0";
print OUT " " unless $first;
print OUT $cluster;
$first = 0;
@@ -31,6 +34,7 @@ sub read_cluster_from_mkcls {
my ($file) = @_;
my %CLUSTER;
open(CLUSTER_FILE,$file) || die("ERROR: could not open cluster file '$file'");
+ binmode(CLUSTER_FILE, ":utf8");
while(<CLUSTER_FILE>) {
chop;
my ($word,$cluster) = split;
@@ -42,3 +46,5 @@ sub read_cluster_from_mkcls {
sub add_cluster_to_string {
}
+
+
diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl
index 5d4a4d313..03d90eaca 100755
--- a/scripts/training/wrappers/parse-de-berkeley.perl
+++ b/scripts/training/wrappers/parse-de-berkeley.perl
@@ -1,21 +1,28 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl -w
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
-my ($JAR,$GRAMMAR,$SPLIT_HYPHEN,$SPLIT_SLASH,$MARK_SPLIT,$BINARIZE);
+my ($JAR,$GRAMMAR,$SPLIT_HYPHEN,$SPLIT_SLASH,$MARK_SPLIT,$BINARIZE,$UNPARSEABLE);
-die("ERROR: syntax is: parse-de-berkeley.perl [-split-hyphen] [-split-slash] [-mark-split] [-binarize] -jar jar-file -gr grammar < in > out\n")
+$UNPARSEABLE = 0;
+
+die("ERROR: syntax is: parse-de-berkeley.perl [-split-hyphen] [-split-slash] [-mark-split] [-binarize] -jar jar-file -gr grammar -unparseable < in > out\n")
unless &GetOptions
('jar=s' => \$JAR,
'gr=s' => \$GRAMMAR,
'split-hyphen' => \$SPLIT_HYPHEN,
'split-slash' => \$SPLIT_SLASH,
'mark-split' => \$MARK_SPLIT,
- 'binarize' => \$BINARIZE)
+ 'binarize' => \$BINARIZE,
+ 'unparseable' => \$UNPARSEABLE
+
+ )
&& defined($JAR) && defined($GRAMMAR);
+#print STDERR "UNPARSEABLE=$UNPARSEABLE\n";
+
die("ERROR: could not find jar file '$JAR'\n") unless -e $JAR;
die("ERROR: could not find grammar file '$GRAMMAR'\n") unless -e $GRAMMAR;
@@ -26,9 +33,15 @@ $SPLIT_SLASH = $SPLIT_SLASH ? "| $RealBin/syntax-hyphen-splitting.perl -slash $B
$SPLIT_SLASH .= " -mark-split" if $SPLIT_SLASH && $MARK_SPLIT;
my $tmp = "/tmp/parse-de-berkeley.$$";
+my $tmpEscaped = "/tmp/parse-de-berkeley.2.$$";
+#print STDERR "tmp=$tmp\n";
+#print STDERR "tmpEscaped=$tmpEscaped\n";
open(TMP,"| $RealBin/../../tokenizer/deescape-special-chars.perl > $tmp");
+open(TMPESCAPED, ">>$tmpEscaped");
while(<STDIN>) {
+ print TMPESCAPED $_;
+
# unsplit hyphens
s/ \@-\@ /-/g if $SPLIT_HYPHEN;
# unsplit slashes
@@ -44,14 +57,30 @@ while(<STDIN>) {
print TMP $_;
}
close(TMP);
+close(TMPESCAPED);
my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $RealBin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN $SPLIT_SLASH";
-print STDERR $cmd."\n";
+#print STDERR "Executing: $cmd \n";
+
+open (TMP, $tmp);
+open (TMPESCAPED, $tmpEscaped);
open(PARSE,"$cmd|");
while(<PARSE>) {
s/\\\@/\@/g;
- print $_;
+ my $outLine = $_;
+ my $unparsedLine = <TMPESCAPED>;
+
+ #print STDERR "unparsedLine=$unparsedLine";
+ #print STDERR "outLine=$outLine" .length($outLine) ."\n";
+
+ if ($UNPARSEABLE == 1 && length($outLine) == 1) {
+ print $unparsedLine;
+ }
+ else {
+ print $outLine;
+ }
}
close(PARSE);
`rm $tmp`;
+`rm $tmpEscaped`;
diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl
index 370187d32..f884b5c01 100755
--- a/scripts/training/wrappers/parse-de-bitpar.perl
+++ b/scripts/training/wrappers/parse-de-bitpar.perl
@@ -15,6 +15,7 @@ my $DEESCAPE = "$SCRIPTS_ROOT_DIR/tokenizer/deescape-special-chars.perl";
my $DEBUG = 0;
my $BASIC = 0;
my $OLD_BITPAR = 0;
+my $UNPARSEABLE = 0;
my $RAW = "";
@@ -22,7 +23,8 @@ GetOptions(
"basic" => \$BASIC,
"bitpar=s" => \$BITPAR,
"old-bitpar" => \$OLD_BITPAR,
- "raw=s" => \$RAW
+ "raw=s" => \$RAW,
+ "unparseable" => \$UNPARSEABLE
) or die("ERROR: unknown options");
`mkdir -p $TMPDIR`;
@@ -71,6 +73,12 @@ if ($OLD_BITPAR)
open(PARSER,$pipeline);
while(my $line = <PARSER>) {
if ($line =~ /^No parse for/) {
+ if ($UNPARSEABLE) {
+ my $len = length($line);
+ $line = substr($line, 15, $len - 17);
+ $line = escape($line);
+ print $line;
+ }
print "\n";
next;
}
diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl
new file mode 100755
index 000000000..1e4b5495d
--- /dev/null
+++ b/scripts/training/wrappers/tagger-german-chunk.perl
@@ -0,0 +1,144 @@
+#!/usr/bin/perl
+
+use strict;
+use Getopt::Long "GetOptions";
+
+# split -a 5 -d ../europarl.clean.5.de
+# ls -1 x????? | ~/workspace/coreutils/parallel/src/parallel /home/s0565741/workspace/treetagger/cmd/run-tagger-chunker-german.sh
+# cat x?????.out > ../out
+
+my $chunkedPath;
+my $treetaggerPath;
+
+GetOptions('chunked=s' => \$chunkedPath,
+ 'tree-tagger=s' => \$treetaggerPath);
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+#my $TMPDIR= "/tmp/chunker.$$";
+my $TMPDIR= "chunker.$$";
+print STDERR "TMPDIR=$TMPDIR\n";
+print STDERR "chunkedPath=$chunkedPath\n";
+`mkdir $TMPDIR`;
+
+my $inPath = "$TMPDIR/in";
+
+open(IN, ">$inPath");
+binmode(IN, ":utf8");
+
+while(my $line = <STDIN>) {
+ chomp($line);
+ print IN "$line\n";
+}
+close(IN);
+
+# call chunker
+if (!defined($chunkedPath)) {
+ if (!defined($treetaggerPath)) {
+ print STDERR "must defined -tree-tagger \n";
+ exit(1);
+ }
+
+ $chunkedPath = "$TMPDIR/chunked";
+ print STDERR "chunkedPath not defined. Now $chunkedPath \n";
+ my $cmd = "$treetaggerPath/cmd/tagger-chunker-german-utf8 < $inPath > $chunkedPath";
+ `$cmd`;
+}
+
+# convert chunked file into Moses XML
+open(CHUNKED, "$chunkedPath");
+open(IN, "$inPath");
+binmode(CHUNKED, ":utf8");
+binmode(IN, ":utf8");
+
+my $sentence = <IN>;
+chomp($sentence);
+my @words = split(/ /, $sentence);
+my $numWords = scalar @words;
+my $prevTag = "";
+my $wordPos = -1;
+
+while(my $chunkLine = <CHUNKED>) {
+ chomp($chunkLine);
+ my @chunkToks = split(/\t/, $chunkLine);
+
+ if (substr($chunkLine, 0, 1) eq "<") {
+ if (substr($chunkLine, 0, 2) eq "</") {
+ # end of tag
+ print "</tree> ";
+ $prevTag = "";
+
+ if ($wordPos == ($numWords - 1)) {
+ # closing bracket of last word in sentence
+ print "\n";
+ $sentence = <IN>;
+ chomp($sentence);
+ @words = split(/ /, $sentence);
+ $numWords = scalar @words;
+ $wordPos = -1;
+ }
+ }
+ else {
+ # beginning of tag
+ if ($wordPos == ($numWords - 1)) {
+ # closing bracket of last word in sentence
+ print "\n";
+ $sentence = <IN>;
+ chomp($sentence);
+ @words = split(/ /, $sentence);
+ $numWords = scalar @words;
+ $wordPos = -1;
+ }
+
+ $prevTag = $chunkToks[0];
+ $prevTag = substr($prevTag, 1, length($prevTag) - 2);
+ print "<tree label=\"$prevTag\">";
+ }
+ }
+ else {
+ # word
+ ++$wordPos;
+
+ if (scalar(@chunkToks) != 3) {
+ # parse error
+ print STDERR "CHUNK LINES SHOULD BE 3 TOKS\n";
+ exit(1);
+ }
+
+ if ($wordPos >= $numWords) {
+ # on new sentence now
+ if (length($prevTag) > 0) {
+ print "</tree>";
+ }
+ print "\n";
+ if (length($prevTag) > 0) {
+ print "<tree label=\"$prevTag\">";
+ }
+
+ $sentence = <IN>;
+ chomp($sentence);
+ @words = split(/ /, $sentence);
+ $numWords = scalar @words;
+ $wordPos = 0;
+ }
+
+ if ($chunkToks[0] ne $words[$wordPos]) {
+ # word in chunk input and sentence should match
+ print STDERR "NOT EQUAL:" .$chunkToks[0] ." != " .$words[$wordPos] ."\n";
+ exit(1);
+ }
+
+ print $chunkToks[0] . " ";
+
+ }
+
+}
+
+print "\n";
+
+close(IN);
+close(CHUNKED);
+
+`rm -rf $TMPDIR`;
+
diff --git a/search/edge_generator.hh b/search/edge_generator.hh
index 203942c6f..048f9f9ac 100644
--- a/search/edge_generator.hh
+++ b/search/edge_generator.hh
@@ -8,7 +8,7 @@
namespace lm {
namespace ngram {
-class ChartState;
+struct ChartState;
} // namespace ngram
} // namespace lm
diff --git a/search/types.hh b/search/types.hh
index f9c849b3f..832ef159f 100644
--- a/search/types.hh
+++ b/search/types.hh
@@ -3,7 +3,7 @@
#include <stdint.h>
-namespace lm { namespace ngram { class ChartState; } }
+namespace lm { namespace ngram { struct ChartState; } }
namespace search {
diff --git a/search/vertex_generator.hh b/search/vertex_generator.hh
index 6fce508d6..328da7933 100644
--- a/search/vertex_generator.hh
+++ b/search/vertex_generator.hh
@@ -7,7 +7,7 @@
namespace lm {
namespace ngram {
-class ChartState;
+struct ChartState;
} // namespace ngram
} // namespace lm
diff --git a/util/exception.hh b/util/exception.hh
index 2fb00667f..4e50a6f3a 100644
--- a/util/exception.hh
+++ b/util/exception.hh
@@ -1,5 +1,5 @@
-#ifndef UTIL_EXCEPTION__
-#define UTIL_EXCEPTION__
+#ifndef UTIL_EXCEPTION_H
+#define UTIL_EXCEPTION_H
#include <exception>
#include <limits>
@@ -84,7 +84,7 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
UTIL_THROW_BACKEND(NULL, Exception, , Modify);
#define UTIL_THROW2(Modify) \
- UTIL_THROW_BACKEND(NULL, util::Exception, , Modify);
+ UTIL_THROW_BACKEND(NULL, util::Exception, , Modify);
#if __GNUC__ >= 3
#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
@@ -146,4 +146,4 @@ inline std::size_t CheckOverflow(uint64_t value) {
} // namespace util
-#endif // UTIL_EXCEPTION__
+#endif // UTIL_EXCEPTION_H
diff --git a/util/file.cc b/util/file.cc
index 25ff8183a..aa61cf9a9 100644
--- a/util/file.cc
+++ b/util/file.cc
@@ -41,9 +41,9 @@ scoped_fd::~scoped_fd() {
}
}
-scoped_FILE::~scoped_FILE() {
- if (file_ && std::fclose(file_)) {
- std::cerr << "Could not close file " << std::endl;
+void scoped_FILE_closer::Close(std::FILE *file) {
+ if (file && std::fclose(file)) {
+ std::cerr << "Could not close file " << file << std::endl;
std::abort();
}
}
diff --git a/util/file.hh b/util/file.hh
index f2bb319d5..7204b6a04 100644
--- a/util/file.hh
+++ b/util/file.hh
@@ -2,6 +2,7 @@
#define UTIL_FILE_H
#include "util/exception.hh"
+#include "util/scoped.hh"
#include "util/string_piece.hh"
#include <cstddef>
@@ -42,29 +43,10 @@ class scoped_fd {
scoped_fd &operator=(const scoped_fd &);
};
-class scoped_FILE {
- public:
- explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {}
-
- ~scoped_FILE();
-
- std::FILE *get() { return file_; }
- const std::FILE *get() const { return file_; }
-
- void reset(std::FILE *to = NULL) {
- scoped_FILE other(file_);
- file_ = to;
- }
-
- std::FILE *release() {
- std::FILE *ret = file_;
- file_ = NULL;
- return ret;
- }
-
- private:
- std::FILE *file_;
+struct scoped_FILE_closer {
+ static void Close(std::FILE *file);
};
+typedef scoped<std::FILE, scoped_FILE_closer> scoped_FILE;
/* Thrown for any operation where the fd is known. */
class FDException : public ErrnoException {
diff --git a/util/read_compressed.cc b/util/read_compressed.cc
index 71ef0e251..cee98040b 100644
--- a/util/read_compressed.cc
+++ b/util/read_compressed.cc
@@ -374,7 +374,6 @@ ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, co
header.resize(original + got);
}
if (header.empty()) {
- hold.release();
return new Complete();
}
switch (DetectMagic(&header[0], header.size())) {
@@ -435,4 +434,15 @@ std::size_t ReadCompressed::Read(void *to, std::size_t amount) {
return internal_->Read(to, amount, *this);
}
+std::size_t ReadCompressed::ReadOrEOF(void *const to_in, std::size_t amount) {
+ uint8_t *to = reinterpret_cast<uint8_t*>(to_in);
+ while (amount) {
+ std::size_t got = Read(to, amount);
+ if (!got) break;
+ to += got;
+ amount -= got;
+ }
+ return to - reinterpret_cast<uint8_t*>(to_in);
+}
+
} // namespace util
diff --git a/util/read_compressed.hh b/util/read_compressed.hh
index 763e6bbd3..767ee94b2 100644
--- a/util/read_compressed.hh
+++ b/util/read_compressed.hh
@@ -62,6 +62,10 @@ class ReadCompressed {
std::size_t Read(void *to, std::size_t amount);
+ // Repeatedly call read to fill a buffer unless EOF is hit.
+ // Return number of bytes read.
+ std::size_t ReadOrEOF(void *const to, std::size_t amount);
+
uint64_t RawAmount() const { return raw_amount_; }
private:
diff --git a/util/scoped.cc b/util/scoped.cc
index 6c5b0c2db..de1d9e940 100644
--- a/util/scoped.cc
+++ b/util/scoped.cc
@@ -32,10 +32,6 @@ void *CallocOrThrow(std::size_t requested) {
return InspectAddr(std::calloc(1, requested), requested, "calloc");
}
-scoped_malloc::~scoped_malloc() {
- std::free(p_);
-}
-
void scoped_malloc::call_realloc(std::size_t requested) {
p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc");
}
diff --git a/util/scoped.hh b/util/scoped.hh
index ae70b6b53..60c36c36a 100644
--- a/util/scoped.hh
+++ b/util/scoped.hh
@@ -4,6 +4,7 @@
#include "util/exception.hh"
#include <cstddef>
+#include <cstdlib>
namespace util {
@@ -16,87 +17,91 @@ class MallocException : public ErrnoException {
void *MallocOrThrow(std::size_t requested);
void *CallocOrThrow(std::size_t requested);
-class scoped_malloc {
+/* Unfortunately, defining the operator* for void * makes the compiler complain.
+ * So scoped is specialized to void. This includes the functionality common to
+ * both, namely everything except reference.
+ */
+template <class T, class Closer> class scoped_base {
public:
- scoped_malloc() : p_(NULL) {}
+ explicit scoped_base(T *p = NULL) : p_(p) {}
- scoped_malloc(void *p) : p_(p) {}
+ ~scoped_base() { Closer::Close(p_); }
- ~scoped_malloc();
-
- void reset(void *p = NULL) {
- scoped_malloc other(p_);
+ void reset(T *p = NULL) {
+ scoped_base other(p_);
p_ = p;
}
- void call_realloc(std::size_t to);
-
- void *get() { return p_; }
- const void *get() const { return p_; }
-
- private:
- void *p_;
-
- scoped_malloc(const scoped_malloc &);
- scoped_malloc &operator=(const scoped_malloc &);
-};
-
-// Hat tip to boost.
-template <class T> class scoped_array {
- public:
- explicit scoped_array(T *content = NULL) : c_(content) {}
-
- ~scoped_array() { delete [] c_; }
-
- T *get() { return c_; }
- const T* get() const { return c_; }
+ T *get() { return p_; }
+ const T *get() const { return p_; }
- T &operator*() { return *c_; }
- const T&operator*() const { return *c_; }
+ T *operator->() { return p_; }
+ const T *operator->() const { return p_; }
- T &operator[](std::size_t idx) { return c_[idx]; }
- const T &operator[](std::size_t idx) const { return c_[idx]; }
-
- void reset(T *to = NULL) {
- scoped_array<T> other(c_);
- c_ = to;
+ T *release() {
+ T *ret = p_;
+ p_ = NULL;
+ return ret;
}
- private:
- T *c_;
+ protected:
+ T *p_;
- scoped_array(const scoped_array &);
- void operator=(const scoped_array &);
+ private:
+ scoped_base(const scoped_base &);
+ scoped_base &operator=(const scoped_base &);
};
-template <class T> class scoped_ptr {
+template <class T, class Closer> class scoped : public scoped_base<T, Closer> {
public:
- explicit scoped_ptr(T *content = NULL) : c_(content) {}
+ explicit scoped(T *p = NULL) : scoped_base<T, Closer>(p) {}
- ~scoped_ptr() { delete c_; }
+ T &operator*() { return *scoped_base<T, Closer>::p_; }
+ const T&operator*() const { return *scoped_base<T, Closer>::p_; }
+};
- T *get() { return c_; }
- const T* get() const { return c_; }
+template <class Closer> class scoped<void, Closer> : public scoped_base<void, Closer> {
+ public:
+ explicit scoped(void *p = NULL) : scoped_base<void, Closer>(p) {}
+};
- T &operator*() { return *c_; }
- const T&operator*() const { return *c_; }
+/* Closer for c functions like std::free and cmph cleanup functions */
+template <class T, void (*clean)(T*)> struct scoped_c_forward {
+ static void Close(T *p) { clean(p); }
+};
+// Call a C function to delete stuff
+template <class T, void (*clean)(T*)> class scoped_c : public scoped<T, scoped_c_forward<T, clean> > {
+ public:
+ explicit scoped_c(T *p = NULL) : scoped<T, scoped_c_forward<T, clean> >(p) {}
+};
- T *operator->() { return c_; }
- const T*operator->() const { return c_; }
+class scoped_malloc : public scoped_c<void, std::free> {
+ public:
+ explicit scoped_malloc(void *p = NULL) : scoped_c<void, std::free>(p) {}
- T &operator[](std::size_t idx) { return c_[idx]; }
- const T &operator[](std::size_t idx) const { return c_[idx]; }
+ void call_realloc(std::size_t to);
+};
- void reset(T *to = NULL) {
- scoped_ptr<T> other(c_);
- c_ = to;
- }
+/* scoped_array using delete[] */
+struct scoped_delete_array_forward {
+ template <class T> static void Close(T *p) { delete [] p; }
+};
+// Hat tip to boost.
+template <class T> class scoped_array : public scoped<T, scoped_delete_array_forward> {
+ public:
+ explicit scoped_array(T *p = NULL) : scoped<T, scoped_delete_array_forward>(p) {}
- private:
- T *c_;
+ T &operator[](std::size_t idx) { return scoped<T, scoped_delete_array_forward>::p_[idx]; }
+ const T &operator[](std::size_t idx) const { return scoped<T, scoped_delete_array_forward>::p_[idx]; }
+};
- scoped_ptr(const scoped_ptr &);
- void operator=(const scoped_ptr &);
+/* scoped_ptr using delete. If only there were a template typedef. */
+struct scoped_delete_forward {
+ template <class T> static void Close(T *p) { delete p; }
+};
+template <class T> class scoped_ptr : public scoped<T, scoped_delete_forward> {
+ public:
+ explicit scoped_ptr(T *p = NULL) : scoped<T, scoped_delete_forward>(p) {}
};
} // namespace util
diff --git a/util/usage.hh b/util/usage.hh
index e578b0a65..5407f5a54 100644
--- a/util/usage.hh
+++ b/util/usage.hh
@@ -13,9 +13,9 @@ double WallTime();
void PrintUsage(std::ostream &to);
// Determine how much physical memory there is. Return 0 on failure.
-uint64_t GuessPhysicalMemory();
+::uint64_t GuessPhysicalMemory();
// Parse a size like unix sort. Sadly, this means the default multiplier is K.
-uint64_t ParseSize(const std::string &arg);
+::uint64_t ParseSize(const std::string &arg);
} // namespace util
#endif // UTIL_USAGE_H